diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index dfe6f65d240e6..23ead74c34ca6 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11131,7 +11131,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { assert(VT.getSizeInBits() == 64); SDLoc DL(Op); - SDValue Cond = Op.getOperand(0); + SDValue Cond = DAG.getFreeze(Op.getOperand(0)); SDValue Zero = DAG.getConstant(0, DL, MVT::i32); SDValue One = DAG.getConstant(1, DL, MVT::i32); diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index b7097a9557b75..c7385e4324e2c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -7791,7 +7791,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: sdiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7927,7 +7927,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: sdiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 @@ -8982,7 +8982,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: srem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9116,7 +9116,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: srem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 @@ -10096,9 +10096,15 @@ define i64 @udiv_i64_9divbits(i8 %size) { } define <2 x i64> @srem_zero_zero() { -; GCN-LABEL: kernel: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_endpgm +; GFX6-LABEL: srem_zero_zero: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: srem_zero_zero: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] entry: %B = srem <2 x i64> zeroinitializer, zeroinitializer ret <2 x i64> %B diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index f8e13fcdd2273..4cb0d2d7b3789 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -521,16 +521,19 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 @@ -2710,16 +2713,19 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index 069a47ec97bfe..e5fe4160a4b05 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -3272,9 +3272,10 @@ define double @v_fmaximum3_f64_fabs0(double %a, double %b, double %c) { ; GFX9-LABEL: v_fmaximum3_f64_fabs0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[6:7], |v[0:1]|, v[2:3] +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc @@ -3306,9 +3307,10 @@ define double @v_fmaximum3_f64_fabs1(double %a, double %b, double %c) { ; GFX9-LABEL: v_fmaximum3_f64_fabs1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], |v[2:3]| +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]| +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc @@ -3343,11 +3345,12 @@ define double @v_fmaximum3_f64_fabs2(double %a, double %b, double %c) { ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], |v[4:5]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc @@ -3374,14 +3377,17 @@ define double @v_fmaximum3_f64_fabs_all(double %a, double %b, double %c) { ; GFX9-LABEL: v_fmaximum3_f64_fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[6:7], |v[0:1]|, |v[2:3]| +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]| -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], |v[4:5]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc @@ -3446,14 +3452,17 @@ define double @v_fmaximum3_f64_fneg_fabs_all(double %a, double %b, double %c) { ; GFX9-LABEL: v_fmaximum3_f64_fneg_fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[6:7], -|v[0:1]|, -|v[2:3]| +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX9-NEXT: v_max_f64 v[6:7], -v[0:1], -v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]| -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3] +; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -|v[4:5]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]| +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index d8746b58b16b7..6873c617c64a1 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -3272,9 +3272,10 @@ define double @v_fminimum3_f64_fabs0(double %a, double %b, double %c) { ; GFX9-LABEL: v_fminimum3_f64_fabs0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[6:7], |v[0:1]|, v[2:3] +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc @@ -3306,9 +3307,10 @@ define double @v_fminimum3_f64_fabs1(double %a, double %b, double %c) { ; GFX9-LABEL: v_fminimum3_f64_fabs1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], |v[2:3]| +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]| +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc @@ -3343,11 +3345,12 @@ define double @v_fminimum3_f64_fabs2(double %a, double %b, double %c) { ; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], |v[4:5]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| +; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc @@ -3374,14 +3377,17 @@ define double @v_fminimum3_f64_fabs_all(double %a, double %b, double %c) { ; GFX9-LABEL: v_fminimum3_f64_fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[6:7], |v[0:1]|, |v[2:3]| +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]| -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], |v[4:5]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| +; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc @@ -3446,14 +3452,17 @@ define double @v_fminimum3_f64_fneg_fabs_all(double %a, double %b, double %c) { ; GFX9-LABEL: v_fminimum3_f64_fneg_fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[6:7], -|v[0:1]|, -|v[2:3]| +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX9-NEXT: v_min_f64 v[6:7], -v[0:1], -v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]| -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3] +; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -|v[4:5]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]| +; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll index e9fd6119d0c36..193cee967f3c4 100644 --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -223,8 +223,9 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; SI-NEXT: v_bfi_b32 v1, s8, v1, v6 ; SI-NEXT: v_mov_b32_e32 v7, s2 ; SI-NEXT: v_add_f64 v[4:5], s[2:3], v[0:1] +; SI-NEXT: s_bitset0_b32 s3, 31 ; SI-NEXT: v_add_f64 v[0:1], v[4:5], -v[0:1] -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[2:3] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[2:3], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -284,14 +285,16 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> % ; SI-NEXT: v_mov_b32_e32 v9, s5 ; SI-NEXT: v_mov_b32_e32 v10, s4 ; SI-NEXT: v_add_f64 v[2:3], s[6:7], v[0:1] +; SI-NEXT: s_bitset0_b32 s7, 31 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[0:1] ; SI-NEXT: v_bfi_b32 v1, s10, v6, v9 -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[4:5] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[0:1] +; SI-NEXT: s_bitset0_b32 s5, 31 ; SI-NEXT: v_add_f64 v[0:1], v[6:7], -v[0:1] -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[4:5] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -365,26 +368,30 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; SI-NEXT: v_mov_b32_e32 v14, s5 ; SI-NEXT: v_mov_b32_e32 v15, s4 ; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[4:5] +; SI-NEXT: s_bitset0_b32 s3, 31 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] ; SI-NEXT: v_bfi_b32 v5, s14, v10, v7 -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[8:9] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[2:3], v[8:9] ; SI-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v6, vcc ; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[4:5] +; SI-NEXT: s_bitset0_b32 s1, 31 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] ; SI-NEXT: v_bfi_b32 v5, s14, v10, v12 -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[0:1]|, v[8:9] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[0:1], v[8:9] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc ; SI-NEXT: v_add_f64 v[6:7], s[6:7], v[4:5] +; SI-NEXT: s_bitset0_b32 s7, 31 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], -v[4:5] ; SI-NEXT: v_bfi_b32 v5, s14, v10, v14 -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[8:9] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[8:9] ; SI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc ; SI-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc ; SI-NEXT: v_add_f64 v[10:11], s[4:5], v[4:5] +; SI-NEXT: s_bitset0_b32 s5, 31 ; SI-NEXT: v_add_f64 v[4:5], v[10:11], -v[4:5] -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[8:9] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[8:9] ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v14, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v15, vcc ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index d97ea042b50fc..f50944cc8a5b1 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -2356,10 +2356,11 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture ; GFX6-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] ; GFX6-NEXT: v_min_f64 v[6:7], v[6:7], s[8:9] ; GFX6-NEXT: s_mov_b32 s8, 0 -; GFX6-NEXT: s_mov_b32 s9, 0x7ff00000 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX6-NEXT: s_mov_b32 s9, 0x7ff00000 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc -; GFX6-NEXT: v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[8:9] +; GFX6-NEXT: v_cmp_neq_f64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 @@ -2374,17 +2375,18 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, 0 +; GFX7-NEXT: v_floor_f64_e32 v[4:5], v[0:1] +; GFX7-NEXT: v_fract_f64_e32 v[6:7], v[0:1] +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GFX7-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX7-NEXT: v_fract_f64_e32 v[4:5], v[0:1] -; GFX7-NEXT: v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[4:5] -; GFX7-NEXT: v_floor_f64_e32 v[6:7], v[0:1] +; GFX7-NEXT: v_cmp_neq_f64_e32 vcc, s[4:5], v[0:1] ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GFX7-NEXT: buffer_store_dwordx2 v[6:7], v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2392,25 +2394,27 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_floor_f64_e32 v[4:5], v[0:1] +; GFX8-NEXT: v_fract_f64_e32 v[6:7], v[0:1] +; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GFX8-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX8-NEXT: v_fract_f64_e32 v[4:5], v[0:1] -; GFX8-NEXT: v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[4:5] -; GFX8-NEXT: v_floor_f64_e32 v[6:7], v[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GFX8-NEXT: global_store_dwordx2 v[2:3], v[6:7], off +; GFX8-NEXT: v_cmp_neq_f64_e32 vcc, s[4:5], v[0:1] +; GFX8-NEXT: global_store_dwordx2 v[2:3], v[4:5], off +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: safe_math_fract_f64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fract_f64_e32 v[4:5], v[0:1] -; GFX11-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| -; GFX11-NEXT: v_floor_f64_e32 v[6:7], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5 -; GFX11-NEXT: global_store_b64 v[2:3], v[6:7], off +; GFX11-NEXT: v_floor_f64_e32 v[4:5], v[0:1] +; GFX11-NEXT: v_fract_f64_e32 v[6:7], v[0:1] +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0x7ff00000, v[0:1] +; GFX11-NEXT: global_store_b64 v[2:3], v[4:5], off +; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v6 :: v_dual_cndmask_b32 v1, 0, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: safe_math_fract_f64: @@ -2420,13 +2424,14 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_fract_f64_e32 v[4:5], v[0:1] -; GFX12-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| -; GFX12-NEXT: v_floor_f64_e32 v[6:7], v[0:1] +; GFX12-NEXT: v_floor_f64_e32 v[4:5], v[0:1] +; GFX12-NEXT: v_fract_f64_e32 v[6:7], v[0:1] +; GFX12-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0x7ff00000, v[0:1] +; GFX12-NEXT: global_store_b64 v[2:3], v[4:5], off ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5 -; GFX12-NEXT: global_store_b64 v[2:3], v[6:7], off +; GFX12-NEXT: v_dual_cndmask_b32 v0, 0, v6 :: v_dual_cndmask_b32 v1, 0, v7 ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call double @llvm.floor.f64(double %x) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll index 3a4bf1c81ed58..0bb973c0e5512 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll @@ -1759,11 +1759,13 @@ define double @test_frexp_f64_i32_only_use_fract(double %a) { ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SDAG-NEXT: s_mov_b32 s4, 0 +; GFX6-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v1 +; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-SDAG-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 v[2:3], v[0:1] -; GFX6-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 v[4:5], v[0:1] +; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_frexp_f64_i32_only_use_fract: @@ -1959,20 +1961,24 @@ define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) { } define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) { -; GFX6-LABEL: test_frexp_v2f64_v2i32_only_use_fract: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0 -; GFX6-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX6-NEXT: v_frexp_mant_f64_e32 v[4:5], v[0:1] -; GFX6-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX6-NEXT: v_frexp_mant_f64_e32 v[4:5], v[2:3] -; GFX6-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX6-NEXT: s_setpc_b64 s[30:31] +; GFX6-SDAG-LABEL: test_frexp_v2f64_v2i32_only_use_fract: +; GFX6-SDAG: ; %bb.0: +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-SDAG-NEXT: s_mov_b32 s4, 0 +; GFX6-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v1 +; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-SDAG-NEXT: s_mov_b32 s5, 0x7ff00000 +; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 v[6:7], v[0:1] +; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[4:5] +; GFX6-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v3 +; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 v[6:7], v[2:3] +; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[4:5] +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_frexp_v2f64_v2i32_only_use_fract: ; GFX8: ; %bb.0: @@ -2005,6 +2011,21 @@ define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) { ; GFX12-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1] ; GFX12-NEXT: v_frexp_mant_f64_e32 v[2:3], v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-GISEL-LABEL: test_frexp_v2f64_v2i32_only_use_fract: +; GFX6-GISEL: ; %bb.0: +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX6-GISEL-NEXT: s_mov_b32 s5, 0x7ff00000 +; GFX6-GISEL-NEXT: v_frexp_mant_f64_e32 v[4:5], v[0:1] +; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX6-GISEL-NEXT: v_frexp_mant_f64_e32 v[4:5], v[2:3] +; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5] +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a) %result.0 = extractvalue { <2 x double>, <2 x i32> } %result, 0 ret <2 x double> %result.0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll index 28781ae9f13c7..53660ffffa691 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s ; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s @@ -7,7 +8,7 @@ ; SI-DAG: v_add_f64 ; SI-DAG: v_add_f64 -; SI-DAG: v_cmp_gt_f64_e64 +; SI-DAG: v_cmp_gt_f64_e32 ; SI: v_cndmask_b32 ; SI: v_cndmask_b32 ; SI: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index af914bd4043cf..2500af1ae109f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -9,32 +9,33 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s5, 0xfffff ; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s7, s3, 0xb0014 -; SI-NEXT: s_addk_i32 s7, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7 -; SI-NEXT: s_and_b32 s8, s3, 0x80000000 +; SI-NEXT: s_bfe_u32 s8, s3, 0xb0014 +; SI-NEXT: s_addk_i32 s8, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; SI-NEXT: s_andn2_b64 s[4:5], s[2:3], s[4:5] -; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: s_and_b32 s9, s3, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s8, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s8, s5 -; SI-NEXT: s_cmp_gt_i32 s7, 51 +; SI-NEXT: s_cselect_b32 s5, s9, s5 +; SI-NEXT: s_cmp_gt_i32 s8, 51 ; SI-NEXT: s_cselect_b32 s8, s2, s4 ; SI-NEXT: s_cselect_b32 s9, s3, s5 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_add_f64 v[0:1], s[2:3], -v[0:1] ; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 -; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec -; SI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: s_and_b64 s[10:11], vcc, exec +; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_bfi_b32 v1, s2, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_add_f64 v[0:1], s[8:9], v[0:1] -; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -49,9 +50,10 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[2:3] ; CI-NEXT: s_mov_b32 s4, s0 ; CI-NEXT: v_add_f64 v[2:3], s[2:3], -v[0:1] -; CI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[2:3]|, 0.5 +; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] ; CI-NEXT: v_mov_b32_e32 v2, s3 -; CI-NEXT: s_and_b64 s[2:3], s[8:9], exec +; CI-NEXT: s_and_b64 s[2:3], vcc, exec ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_mov_b32_e32 v3, s0 ; CI-NEXT: v_bfi_b32 v3, s5, v3, v2 @@ -78,7 +80,7 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 +; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4 @@ -95,13 +97,14 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] -; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; SI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc -; SI-NEXT: v_bfi_b32 v3, s2, v2, v3 +; SI-NEXT: v_mov_b32_e32 v2, 0x3ff00000 +; SI-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[6:7] +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-NEXT: v_bfi_b32 v3, s4, v2, v3 ; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; @@ -115,13 +118,14 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; CI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; CI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; CI-NEXT: v_mov_b32_e32 v2, 0x3ff00000 +; CI-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[6:7] +; CI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; CI-NEXT: v_bfi_b32 v3, s2, v2, v3 ; CI-NEXT: v_mov_b32_e32 v2, v1 ; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] @@ -160,35 +164,37 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: v_cmp_ge_f64_e64 s[14:15], |v[0:1]|, 0.5 -; SI-NEXT: s_brev_b32 s10, -2 -; SI-NEXT: s_and_b64 s[4:5], s[14:15], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s3 +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] +; SI-NEXT: s_brev_b32 s3, -2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_bfe_u32 s4, s9, 0xb0014 +; SI-NEXT: s_add_i32 s10, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 ; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[4:5] ; SI-NEXT: s_and_b32 s6, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 ; SI-NEXT: s_cselect_b32 s5, s6, s5 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cmp_gt_i32 s10, 51 ; SI-NEXT: s_cselect_b32 s4, s8, s4 ; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: v_add_f64 v[2:3], s[8:9], -v[2:3] ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 -; SI-NEXT: v_bfi_b32 v1, s10, v0, v1 -; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; SI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] +; SI-NEXT: v_bfi_b32 v1, s3, v0, v1 +; SI-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 ; SI-NEXT: v_add_f64 v[2:3], s[12:13], v[0:1] -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_bfi_b32 v1, s10, v1, v4 +; SI-NEXT: v_bfi_b32 v1, s3, v1, v4 ; SI-NEXT: v_add_f64 v[0:1], s[4:5], v[0:1] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -207,14 +213,16 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] ; CI-NEXT: v_add_f64 v[4:5], s[10:11], -v[2:3] ; CI-NEXT: v_mov_b32_e32 v1, s11 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[4:5]|, 0.5 +; CI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] ; CI-NEXT: v_add_f64 v[4:5], s[8:9], -v[6:7] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_and_b64 s[4:5], vcc, exec +; CI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_mov_b32_e32 v8, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[4:5]|, 0.5 +; CI-NEXT: s_and_b64 s[4:5], vcc, exec ; CI-NEXT: v_bfi_b32 v1, s2, v8, v1 -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[2:3], v[2:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -253,76 +261,80 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: v_cmp_ge_f64_e64 s[18:19], |v[0:1]|, 0.5 +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: s_and_b64 s[4:5], s[18:19], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s3 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_bfe_u32 s4, s9, 0xb0014 +; SI-NEXT: s_add_i32 s10, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 ; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[4:5] -; SI-NEXT: s_and_b32 s10, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s11, s9, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s10, s5 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_brev_b32 s18, -2 +; SI-NEXT: s_cselect_b32 s5, s11, s5 +; SI-NEXT: s_cmp_gt_i32 s10, 51 +; SI-NEXT: s_brev_b32 s3, -2 ; SI-NEXT: s_cselect_b32 s4, s8, s4 -; SI-NEXT: v_bfi_b32 v5, s18, v0, v1 +; SI-NEXT: v_bfi_b32 v5, s3, v0, v1 ; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_add_f64 v[0:1], s[8:9], -v[0:1] ; SI-NEXT: v_add_f64 v[2:3], s[16:17], v[4:5] -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] ; SI-NEXT: v_mov_b32_e32 v6, s9 -; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v5, s3 -; SI-NEXT: s_bfe_u32 s3, s15, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], s3 +; SI-NEXT: s_and_b64 s[10:11], vcc, exec +; SI-NEXT: s_cselect_b32 s8, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: s_bfe_u32 s8, s15, 0xb0014 +; SI-NEXT: s_add_i32 s10, s8, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], s10 ; SI-NEXT: s_andn2_b64 s[8:9], s[14:15], s[8:9] -; SI-NEXT: s_and_b32 s10, s15, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s11, s15, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: s_cselect_b32 s8, 0, s8 -; SI-NEXT: s_cselect_b32 s9, s10, s9 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s9, s11, s9 +; SI-NEXT: s_cmp_gt_i32 s10, 51 ; SI-NEXT: s_cselect_b32 s8, s14, s8 ; SI-NEXT: s_cselect_b32 s9, s15, s9 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_add_f64 v[0:1], s[14:15], -v[0:1] -; SI-NEXT: v_bfi_b32 v5, s18, v5, v6 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 +; SI-NEXT: v_bfi_b32 v5, s3, v5, v6 +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] ; SI-NEXT: v_add_f64 v[0:1], s[4:5], v[4:5] -; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v8, s3 -; SI-NEXT: s_bfe_u32 s3, s13, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s3 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_bfe_u32 s4, s13, 0xb0014 +; SI-NEXT: s_add_i32 s10, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 ; SI-NEXT: s_andn2_b64 s[4:5], s[12:13], s[4:5] ; SI-NEXT: s_and_b32 s6, s13, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 ; SI-NEXT: s_cselect_b32 s5, s6, s5 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cmp_gt_i32 s10, 51 ; SI-NEXT: s_cselect_b32 s5, s13, s5 ; SI-NEXT: s_cselect_b32 s4, s12, s4 ; SI-NEXT: v_mov_b32_e32 v6, s5 ; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: v_add_f64 v[6:7], s[12:13], -v[5:6] ; SI-NEXT: v_mov_b32_e32 v9, s15 -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[6:7]|, 0.5 -; SI-NEXT: v_bfi_b32 v5, s18, v8, v9 -; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[6:7] +; SI-NEXT: v_bfi_b32 v5, s3, v8, v9 +; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 ; SI-NEXT: v_add_f64 v[6:7], s[8:9], v[4:5] -; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: v_mov_b32_e32 v5, s6 ; SI-NEXT: v_mov_b32_e32 v8, s13 -; SI-NEXT: v_bfi_b32 v5, s18, v5, v8 +; SI-NEXT: v_bfi_b32 v5, s3, v5, v8 ; SI-NEXT: v_add_f64 v[4:5], s[4:5], v[4:5] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -342,31 +354,35 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] ; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s11 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5 +; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] ; CI-NEXT: v_add_f64 v[2:3], s[8:9], -v[6:7] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_and_b64 s[4:5], vcc, exec +; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] ; CI-NEXT: v_mov_b32_e32 v8, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5 ; CI-NEXT: v_bfi_b32 v5, s2, v8, v5 +; CI-NEXT: s_and_b64 s[4:5], vcc, exec ; CI-NEXT: v_trunc_f64_e32 v[8:9], s[14:15] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[8:9] +; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] ; CI-NEXT: v_mov_b32_e32 v5, s4 ; CI-NEXT: v_mov_b32_e32 v10, s9 +; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[8:9] ; CI-NEXT: v_bfi_b32 v5, s2, v5, v10 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[10:11], s[12:13] +; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] ; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: v_add_f64 v[6:7], s[12:13], -v[10:11] +; CI-NEXT: s_and_b64 s[4:5], vcc, exec +; CI-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[6:7] ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[6:7]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v12, s15 -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_and_b64 s[4:5], vcc, exec ; CI-NEXT: v_bfi_b32 v5, s2, v5, v12 ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] @@ -407,9 +423,10 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v1, s25 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: v_cmp_ge_f64_e64 s[26:27], |v[0:1]|, 0.5 +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: s_and_b64 s[4:5], s[26:27], exec +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_bfe_u32 s4, s9, 0xb0014 @@ -429,9 +446,10 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_add_f64 v[0:1], s[8:9], -v[0:1] ; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] ; SI-NEXT: v_add_f64 v[2:3], s[24:25], v[8:9] -; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec +; SI-NEXT: s_and_b64 s[10:11], vcc, exec ; SI-NEXT: s_cselect_b32 s8, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: s_bfe_u32 s8, s15, 0xb0014 @@ -449,9 +467,10 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_add_f64 v[0:1], s[14:15], -v[0:1] ; SI-NEXT: v_bfi_b32 v9, s3, v4, v5 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] ; SI-NEXT: v_add_f64 v[0:1], s[4:5], v[8:9] -; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v6, s4 ; SI-NEXT: s_bfe_u32 s4, s13, 0xb0014 @@ -469,10 +488,11 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v5, s5 ; SI-NEXT: v_add_f64 v[4:5], s[12:13], -v[4:5] ; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5 +; SI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] ; SI-NEXT: v_bfi_b32 v9, s3, v6, v7 ; SI-NEXT: v_add_f64 v[6:7], s[8:9], v[8:9] -; SI-NEXT: s_and_b64 s[8:9], s[10:11], exec +; SI-NEXT: s_and_b64 s[8:9], vcc, exec ; SI-NEXT: s_cselect_b32 s8, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v9, s8 ; SI-NEXT: s_bfe_u32 s8, s19, 0xb0014 @@ -490,10 +510,11 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_add_f64 v[4:5], s[18:19], -v[4:5] ; SI-NEXT: v_mov_b32_e32 v10, s13 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5 +; SI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] ; SI-NEXT: v_bfi_b32 v9, s3, v9, v10 ; SI-NEXT: v_add_f64 v[4:5], s[4:5], v[8:9] -; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v12, s4 ; SI-NEXT: s_bfe_u32 s4, s17, 0xb0014 @@ -511,10 +532,11 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: v_add_f64 v[10:11], s[16:17], -v[9:10] ; SI-NEXT: v_mov_b32_e32 v13, s19 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[10:11]|, 0.5 +; SI-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[10:11] ; SI-NEXT: v_bfi_b32 v9, s3, v12, v13 ; SI-NEXT: v_add_f64 v[12:13], s[8:9], v[8:9] -; SI-NEXT: s_and_b64 s[8:9], s[10:11], exec +; SI-NEXT: s_and_b64 s[8:9], vcc, exec ; SI-NEXT: s_cselect_b32 s8, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v14, s8 ; SI-NEXT: s_bfe_u32 s8, s23, 0xb0014 @@ -532,10 +554,11 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v9, s8 ; SI-NEXT: v_add_f64 v[10:11], s[22:23], -v[9:10] ; SI-NEXT: v_mov_b32_e32 v15, s17 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[10:11]|, 0.5 +; SI-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[10:11] ; SI-NEXT: v_bfi_b32 v9, s3, v14, v15 ; SI-NEXT: v_add_f64 v[10:11], s[4:5], v[8:9] -; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_bfe_u32 s4, s21, 0xb0014 @@ -553,9 +576,10 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v14, s4 ; SI-NEXT: v_add_f64 v[14:15], s[20:21], -v[14:15] ; SI-NEXT: v_mov_b32_e32 v16, s23 -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[14:15]|, 0.5 +; SI-NEXT: v_and_b32_e32 v15, 0x7fffffff, v15 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[14:15] ; SI-NEXT: v_bfi_b32 v9, s3, v9, v16 -; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; SI-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 ; SI-NEXT: v_add_f64 v[16:17], s[8:9], v[8:9] ; SI-NEXT: v_mov_b32_e32 v9, s6 @@ -574,87 +598,95 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 ; CI-NEXT: s_brev_b32 s6, -2 -; CI-NEXT: v_mov_b32_e32 v12, 0 +; CI-NEXT: v_mov_b32_e32 v4, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11] -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[8:9] +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] ; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] -; CI-NEXT: v_add_f64 v[6:7], s[8:9], -v[4:5] -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 -; CI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[6:7]|, 0.5 +; CI-NEXT: v_add_f64 v[8:9], s[8:9], -v[6:7] +; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] +; CI-NEXT: v_and_b32_e32 v9, 0x7fffffff, v9 +; CI-NEXT: s_and_b64 s[2:3], vcc, exec +; CI-NEXT: v_cmp_le_f64_e64 s[0:1], 0.5, v[8:9] +; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s11 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[14:15] +; CI-NEXT: v_bfi_b32 v5, s6, v2, v5 ; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: s_cselect_b32 s7, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v8, s11 -; CI-NEXT: s_and_b64 s[0:1], s[2:3], exec -; CI-NEXT: v_mov_b32_e32 v2, s7 -; CI-NEXT: v_trunc_f64_e32 v[6:7], s[14:15] -; CI-NEXT: v_bfi_b32 v13, s6, v2, v8 +; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[12:13] -; CI-NEXT: v_mov_b32_e32 v8, s0 -; CI-NEXT: v_mov_b32_e32 v9, s9 -; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[6:7] -; CI-NEXT: v_bfi_b32 v13, s6, v8, v9 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[0:1]|, 0.5 -; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[12:13] -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[12:13] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_add_f64 v[8:9], s[12:13], -v[4:5] +; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[8:9] +; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v10, s9 +; CI-NEXT: v_bfi_b32 v5, s6, v5, v10 +; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] +; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[12:13] +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: v_add_f64 v[10:11], s[12:13], -v[6:7] ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v10, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[8:9]|, 0.5 -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[18:19] -; CI-NEXT: v_mov_b32_e32 v11, s15 -; CI-NEXT: v_bfi_b32 v13, s6, v10, v11 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_add_f64 v[10:11], s[18:19], -v[8:9] +; CI-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[10:11] +; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v12, s15 +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: v_trunc_f64_e32 v[10:11], s[18:19] +; CI-NEXT: v_bfi_b32 v5, s6, v5, v12 ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[12:13] -; CI-NEXT: v_mov_b32_e32 v13, s0 +; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s0 ; CI-NEXT: v_mov_b32_e32 v14, s13 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5 -; CI-NEXT: v_bfi_b32 v13, s6, v13, v14 +; CI-NEXT: v_add_f64 v[12:13], s[18:19], -v[10:11] +; CI-NEXT: v_bfi_b32 v5, s6, v5, v14 ; CI-NEXT: v_trunc_f64_e32 v[14:15], s[16:17] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_and_b32_e32 v13, 0x7fffffff, v13 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[12:13] +; CI-NEXT: v_add_f64 v[12:13], s[16:17], -v[14:15] +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: v_and_b32_e32 v13, 0x7fffffff, v13 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[12:13] ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[10:11], s[16:17], -v[14:15] -; CI-NEXT: v_add_f64 v[4:5], v[4:5], v[12:13] -; CI-NEXT: v_mov_b32_e32 v13, s0 +; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s0 ; CI-NEXT: v_mov_b32_e32 v16, s19 -; CI-NEXT: v_bfi_b32 v13, s6, v13, v16 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5 +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: v_bfi_b32 v5, s6, v5, v16 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_trunc_f64_e32 v[16:17], s[22:23] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_add_f64 v[12:13], v[10:11], v[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v10, s17 +; CI-NEXT: v_bfi_b32 v5, s6, v5, v10 ; CI-NEXT: v_add_f64 v[18:19], s[22:23], -v[16:17] +; CI-NEXT: v_add_f64 v[10:11], v[14:15], v[4:5] +; CI-NEXT: v_trunc_f64_e32 v[14:15], s[20:21] +; CI-NEXT: v_and_b32_e32 v19, 0x7fffffff, v19 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[18:19] +; CI-NEXT: v_add_f64 v[18:19], s[20:21], -v[14:15] +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: v_and_b32_e32 v19, 0x7fffffff, v19 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[18:19] ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[10:11], v[8:9], v[12:13] -; CI-NEXT: v_mov_b32_e32 v8, s0 -; CI-NEXT: v_mov_b32_e32 v9, s17 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5 -; CI-NEXT: v_trunc_f64_e32 v[18:19], s[20:21] -; CI-NEXT: v_bfi_b32 v13, s6, v8, v9 -; CI-NEXT: v_add_f64 v[8:9], v[14:15], v[12:13] -; CI-NEXT: v_add_f64 v[13:14], s[20:21], -v[18:19] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[13:14]|, 0.5 -; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: v_mov_b32_e32 v18, s23 ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v13, s2 -; CI-NEXT: v_mov_b32_e32 v14, s23 -; CI-NEXT: v_mov_b32_e32 v20, s0 +; CI-NEXT: v_bfi_b32 v5, s6, v5, v18 +; CI-NEXT: v_mov_b32_e32 v18, s0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; CI-NEXT: v_bfi_b32 v13, s6, v13, v14 -; CI-NEXT: v_mov_b32_e32 v21, s21 -; CI-NEXT: v_add_f64 v[14:15], v[16:17], v[12:13] -; CI-NEXT: v_bfi_b32 v13, s6, v20, v21 -; CI-NEXT: v_add_f64 v[12:13], v[18:19], v[12:13] +; CI-NEXT: v_mov_b32_e32 v19, s21 +; CI-NEXT: v_add_f64 v[16:17], v[16:17], v[4:5] +; CI-NEXT: v_bfi_b32 v5, s6, v18, v19 +; CI-NEXT: v_add_f64 v[14:15], v[14:15], v[4:5] ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll index 8036e32f90eb0..5e2412742ec69 100644 --- a/llvm/test/CodeGen/AMDGPU/lround.ll +++ b/llvm/test/CodeGen/AMDGPU/lround.ll @@ -101,7 +101,8 @@ define i32 @intrinsic_lround_i32_f64(double %arg) { ; GFX9-SDAG-NEXT: s_brev_b32 s4, -2 ; GFX9-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x3ff00000 -; GFX9-SDAG-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 +; GFX9-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX9-SDAG-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-SDAG-NEXT: v_bfi_b32 v1, s4, v0, v1 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 @@ -129,8 +130,9 @@ define i32 @intrinsic_lround_i32_f64(double %arg) { ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] ; GFX10-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX10-SDAG-NEXT: v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x3ff00000, s4 +; GFX10-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX10-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x3ff00000, vcc_lo ; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v0, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] @@ -156,9 +158,10 @@ define i32 @intrinsic_lround_i32_f64(double %arg) { ; GFX11-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX11-SDAG-NEXT: v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x3ff00000, s0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x3ff00000, vcc_lo ; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v0, v1 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -352,7 +355,8 @@ define i64 @intrinsic_lround_i64_f64(double %arg) { ; GFX9-SDAG-NEXT: s_brev_b32 s4, -2 ; GFX9-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 +; GFX9-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX9-SDAG-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc ; GFX9-SDAG-NEXT: v_bfi_b32 v1, s4, v4, v1 ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] @@ -397,8 +401,9 @@ define i64 @intrinsic_lround_i64_f64(double %arg) { ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] ; GFX10-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SDAG-NEXT: v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4 +; GFX10-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX10-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, vcc_lo ; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v4, v1 ; GFX10-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] @@ -431,12 +436,12 @@ define i64 @intrinsic_lround_i64_f64(double %arg) { ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0 +; GFX11-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v5, 0x7fffffff, v5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, vcc_lo ; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v4, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] @@ -643,7 +648,8 @@ define i64 @intrinsic_llround_i64_f64(double %arg) { ; GFX9-SDAG-NEXT: s_brev_b32 s4, -2 ; GFX9-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 +; GFX9-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX9-SDAG-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc ; GFX9-SDAG-NEXT: v_bfi_b32 v1, s4, v4, v1 ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] @@ -688,8 +694,9 @@ define i64 @intrinsic_llround_i64_f64(double %arg) { ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] ; GFX10-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SDAG-NEXT: v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4 +; GFX10-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX10-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, vcc_lo ; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v4, v1 ; GFX10-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] @@ -722,12 +729,12 @@ define i64 @intrinsic_llround_i64_f64(double %arg) { ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0 +; GFX11-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v5, 0x7fffffff, v5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, vcc_lo ; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v4, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index ba9dd8f7c2468..5d0e4bf1d34d0 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -559,16 +559,19 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 @@ -1943,16 +1946,19 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll index 59a1fe041bf90..3b9462cd690d5 100644 --- a/llvm/test/CodeGen/AMDGPU/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll @@ -1125,16 +1125,18 @@ define double @v_roundeven_f64(double %x) { ; SDAG_GFX6: ; %bb.0: ; SDAG_GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG_GFX6-NEXT: s_brev_b32 s6, -2 -; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, 0x43300000 -; SDAG_GFX6-NEXT: v_bfi_b32 v3, s6, v2, v1 -; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, 0 +; SDAG_GFX6-NEXT: v_mov_b32_e32 v4, 0x43300000 +; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v4, v1 +; SDAG_GFX6-NEXT: v_mov_b32_e32 v4, 0 ; SDAG_GFX6-NEXT: s_mov_b32 s4, -1 -; SDAG_GFX6-NEXT: v_add_f64 v[4:5], v[0:1], v[2:3] +; SDAG_GFX6-NEXT: v_add_f64 v[6:7], v[0:1], v[4:5] +; SDAG_GFX6-NEXT: v_and_b32_e32 v3, 0x7fffffff, v1 +; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, v0 ; SDAG_GFX6-NEXT: s_mov_b32 s5, 0x432fffff -; SDAG_GFX6-NEXT: v_add_f64 v[2:3], v[4:5], -v[2:3] -; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] -; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG_GFX6-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] +; SDAG_GFX6-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[2:3] +; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31] ; ; SDAG_GFX7-LABEL: v_roundeven_f64: @@ -1215,9 +1217,10 @@ define double @v_roundeven_f64_fneg(double %x) { ; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, 0 ; SDAG_GFX6-NEXT: v_add_f64 v[4:5], -v[0:1], v[2:3] ; SDAG_GFX6-NEXT: s_mov_b32 s4, -1 +; SDAG_GFX6-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; SDAG_GFX6-NEXT: s_mov_b32 s5, 0x432fffff ; SDAG_GFX6-NEXT: v_add_f64 v[2:3], v[4:5], -v[2:3] -; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] +; SDAG_GFX6-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1] ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc ; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1305,20 +1308,24 @@ define <2 x double> @v_roundeven_v2f64(<2 x double> %x) { ; SDAG_GFX6: ; %bb.0: ; SDAG_GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG_GFX6-NEXT: s_brev_b32 s6, -2 -; SDAG_GFX6-NEXT: v_mov_b32_e32 v8, 0x43300000 -; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v8, v1 +; SDAG_GFX6-NEXT: v_mov_b32_e32 v9, 0x43300000 +; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v9, v1 ; SDAG_GFX6-NEXT: v_mov_b32_e32 v4, 0 ; SDAG_GFX6-NEXT: v_add_f64 v[6:7], v[0:1], v[4:5] ; SDAG_GFX6-NEXT: s_mov_b32 s4, -1 -; SDAG_GFX6-NEXT: s_mov_b32 s5, 0x432fffff ; SDAG_GFX6-NEXT: v_add_f64 v[5:6], v[6:7], -v[4:5] -; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] +; SDAG_GFX6-NEXT: v_and_b32_e32 v8, 0x7fffffff, v1 +; SDAG_GFX6-NEXT: v_mov_b32_e32 v7, v0 +; SDAG_GFX6-NEXT: s_mov_b32 s5, 0x432fffff +; SDAG_GFX6-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[7:8] ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v8, v3 +; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v9, v3 ; SDAG_GFX6-NEXT: v_add_f64 v[7:8], v[2:3], v[4:5] ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc ; SDAG_GFX6-NEXT: v_add_f64 v[4:5], v[7:8], -v[4:5] -; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[2:3]|, s[4:5] +; SDAG_GFX6-NEXT: v_and_b32_e32 v7, 0x7fffffff, v3 +; SDAG_GFX6-NEXT: v_mov_b32_e32 v6, v2 +; SDAG_GFX6-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[6:7] ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll index ec3781fbf0fc4..f497752994852 100644 --- a/llvm/test/CodeGen/AMDGPU/select-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll @@ -841,3 +841,23 @@ ret: ret void } +define i64 @poison_should_freeze(i1 %cond1, i32 %val, i16 %val2, i64 %a, i64 %b) { +; GCN-LABEL: poison_should_freeze: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_mov_b32_e32 v7, 0x5040100 +; GCN-NEXT: v_perm_b32 v2, v2, s4, v7 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] + %poisonv = insertelement <2 x i16> poison, i16 %val2, i32 1 + %poison = bitcast <2 x i16> %poisonv to i32 + %cond2 = select i1 %cond1, i32 %poison, i32 %val + %cmp = icmp eq i32 %cond2, 0 + %select = select i1 %cmp, i64 %a, i64 %b + ret i64 %select +} diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index 6da7d1b7ee868..a6b8ea3963b38 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -1819,7 +1819,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TAHITI-NEXT: v_mul_hi_u32 v1, v0, v1 ; TAHITI-NEXT: v_mul_lo_u32 v1, v1, v2 ; TAHITI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; TAHITI-NEXT: v_subrev_i32_e32 v1, vcc, v2, v0 +; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 ; TAHITI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; TAHITI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 @@ -6232,7 +6232,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mul_hi_u32 v8, v14, v8 ; TONGA-NEXT: v_mul_lo_u32 v8, v8, v10 ; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v14, v8 -; TONGA-NEXT: v_subrev_u32_e32 v9, vcc, v10, v8 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10 ; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 47dfa9f4fc2d3..33c2ce628e108 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -921,45 +921,47 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[10:11], s[2:3], 31 -; GCN-NEXT: s_ashr_i64 s[6:7], s[4:5], 31 -; GCN-NEXT: s_ashr_i32 s4, s5, 31 -; GCN-NEXT: s_add_u32 s6, s6, s4 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: s_addc_u32 s7, s7, s4 -; GCN-NEXT: s_xor_b64 s[8:9], s[6:7], s[4:5] +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 31 +; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 31 +; GCN-NEXT: s_ashr_i32 s6, s5, 31 +; GCN-NEXT: s_add_u32 s4, s4, s6 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: s_addc_u32 s5, s5, s6 +; GCN-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GCN-NEXT: s_sub_u32 s2, 0, s8 -; GCN-NEXT: s_subb_u32 s4, 0, s9 -; GCN-NEXT: s_ashr_i32 s12, s3, 31 +; GCN-NEXT: s_sub_u32 s4, 0, s8 +; GCN-NEXT: s_subb_u32 s5, 0, s9 +; GCN-NEXT: s_ashr_i32 s10, s3, 31 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s13, s12 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_add_u32 s2, s2, s10 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: s_addc_u32 s3, s3, s10 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s2, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 ; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc @@ -967,12 +969,12 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 @@ -988,20 +990,18 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: s_add_u32 s2, s10, s12 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_addc_u32 s3, s11, s12 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13] -; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s11, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 +; GCN-NEXT: v_mul_lo_u32 v2, s12, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s12, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s12, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s13, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s13, v1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s13, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s13, v0 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc @@ -1013,9 +1013,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_mul_lo_u32 v0, s8, v0 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s13, v1 ; GCN-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s8, v0 ; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] @@ -1030,7 +1030,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v4, s11 +; GCN-NEXT: v_mov_b32_e32 v4, s13 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc @@ -1042,10 +1042,10 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_xor_b32_e32 v0, s12, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s12, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 +; GCN-NEXT: v_xor_b32_e32 v0, s10, v0 +; GCN-NEXT: v_xor_b32_e32 v1, s10, v1 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll index f0829b53168d9..c12265bd7f372 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll @@ -3924,37 +3924,37 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[26:27] -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[24:25] -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[10:11], v[26:27] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[12:13], v[28:29] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[0:1], v[8:9] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[12:13] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[4:5], v[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4028,37 +4028,37 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[26:27] -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[10:11], v[26:27] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[12:13], v[28:29] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[0:1], v[8:9] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[0:1], v[8:9] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[12:13] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[4:5], v[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4132,47 +4132,49 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32 -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[8:9], v[24:25] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[0:1], v[16:17] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[8:9], v[12:13], v[28:29] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[10:11], v[4:5], v[20:21] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[26:27] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[0:1], v[2:3], v[18:19] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[12:13] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[0:1], v[0:1], v[8:9] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[2:3], v[6:7], v[22:23] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[2:3], v[2:3], v[10:11] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[24:25] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[0:1], v[0:1], v[16:17] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[2:3], v[12:13], v[28:29] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[4:5], v[20:21] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[6:7], v[22:23] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[8:9], v[10:11], v[26:27] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[10:11], v[2:3], v[18:19] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v17, v21, v5, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v23, v7, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v27, v11, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[10:11] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v16, v20, v4, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v22, v6, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v26, v10, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[10:11] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[6:7] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[0:1], v[16:17], v[12:13] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[2:3], v[0:1], v[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v17, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v16, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[2:3] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[14:15], v[30:31] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v7, v31, v15, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v30, v14, vcc +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -4242,49 +4244,49 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[8:9], v[24:25] +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[0:1], v[16:17] +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s6, v[6:7], v[22:23] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s4 ; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[10:11], v[26:27] ; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[2:3], v[18:19] -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[6:7], v[22:23] -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s6, v[8:9], v[24:25] -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s7, v[0:1], v[16:17] -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s8, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s5 +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s6 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s7 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s7 -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[12:13] -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s6, v[0:1], v[8:9] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s6 -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[14:15], v[30:31] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v15, v31, v15, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v14, v30, v14, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s6 +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[0:1], v[8:9] ; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[2:3], v[10:11] -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s5 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s4 -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[2:3], v[6:7] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s5 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[14:15], v[30:31] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[12:13] +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s4 +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[2:3], v[6:7] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4346,50 +4348,49 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[8:9], v[24:25] +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[0:1], v[16:17] +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s2, v[6:7], v[22:23] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[10:11], v[26:27] ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[18:19] -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[6:7], v[22:23] -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s2, v[8:9], v[24:25] -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s3, v[0:1], v[16:17] -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[4:5], v[20:21] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[4:5], v[20:21] ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[0:1], v[8:9] ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[10:11] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s2, v[0:1], v[8:9] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[14:15], v[30:31] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[12:13] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[6:7], v[14:15] +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[6:7], v[14:15] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[2:3], v[6:7] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[6:7] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4453,58 +4454,58 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[8:9], v[24:25] +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[0:1], v[16:17] +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s2, v[6:7], v[22:23] +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[10:11], v[26:27] ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[18:19] -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[6:7], v[22:23] -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s2, v[8:9], v[24:25] -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s3, v[0:1], v[16:17] -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[4:5], v[20:21] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[4:5], v[20:21] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[10:11] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s2, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[14:15], v[30:31] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[12:13] ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[6:7], v[14:15] +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[6:7], v[14:15] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[2:3], v[6:7] -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[6:7] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll index e67420562e257..5056747c33cc2 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll @@ -3924,37 +3924,37 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[26:27] -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[22:23] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[24:25] -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[26:27] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[12:13], v[28:29] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[0:1], v[8:9] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[12:13] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4028,37 +4028,37 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[26:27] -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[22:23] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[26:27] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[12:13], v[28:29] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[0:1], v[8:9] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[0:1], v[8:9] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[12:13] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4132,47 +4132,49 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32 -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[24:25] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[0:1], v[16:17] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[8:9], v[12:13], v[28:29] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[10:11], v[4:5], v[20:21] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[26:27] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[18:19] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[12:13] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[0:1], v[0:1], v[8:9] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[2:3], v[6:7], v[22:23] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[10:11] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[24:25] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[0:1], v[0:1], v[16:17] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[2:3], v[12:13], v[28:29] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[20:21] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[6:7], v[22:23] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[8:9], v[10:11], v[26:27] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[10:11], v[2:3], v[18:19] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v17, v21, v5, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v23, v7, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v27, v11, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[10:11] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v16, v20, v4, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v22, v6, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v26, v10, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[10:11] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[6:7] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[0:1], v[16:17], v[12:13] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[2:3], v[0:1], v[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v17, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v16, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[2:3] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[14:15], v[30:31] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v7, v31, v15, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v30, v14, vcc +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -4242,49 +4244,49 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[24:25] +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[0:1], v[16:17] +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s6, v[6:7], v[22:23] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s4 ; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[26:27] ; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[2:3], v[18:19] -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[6:7], v[22:23] -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s6, v[8:9], v[24:25] -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s7, v[0:1], v[16:17] -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s8, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s5 +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s6 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s7 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s7 -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[12:13] -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s6, v[0:1], v[8:9] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s6 -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[14:15], v[30:31] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v15, v31, v15, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v14, v30, v14, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s6 +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[0:1], v[8:9] ; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[2:3], v[10:11] -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s5 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s4 -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[6:7] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s5 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[14:15], v[30:31] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[12:13] +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s4 +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[2:3], v[6:7] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4346,50 +4348,49 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[24:25] +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[0:1], v[16:17] +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s2, v[6:7], v[22:23] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[26:27] ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[18:19] -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[6:7], v[22:23] -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s2, v[8:9], v[24:25] -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s3, v[0:1], v[16:17] -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[4:5], v[20:21] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[4:5], v[20:21] ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[0:1], v[8:9] ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[10:11] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s2, v[0:1], v[8:9] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[14:15], v[30:31] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[12:13] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[6:7], v[14:15] +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[6:7], v[14:15] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[6:7] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[6:7] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4453,58 +4454,58 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[24:25] +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[0:1], v[16:17] +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s2, v[6:7], v[22:23] +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[26:27] ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[18:19] -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[6:7], v[22:23] -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s2, v[8:9], v[24:25] -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s3, v[0:1], v[16:17] -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[4:5], v[20:21] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[4:5], v[20:21] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[10:11] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s2, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[14:15], v[30:31] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[12:13] ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[6:7], v[14:15] +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[6:7], v[14:15] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[6:7] -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[6:7] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll index 92993d07b4f8f..184c80765430c 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll @@ -3805,37 +3805,37 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[26:27] -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[22:23] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[24:25] -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[26:27] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[12:13], v[28:29] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[8:9] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[12:13] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -3909,37 +3909,37 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[26:27] -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[22:23] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[26:27] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[12:13], v[28:29] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[8:9] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[12:13] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4013,47 +4013,49 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32 -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[24:25] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[16:17] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[8:9], v[12:13], v[28:29] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[10:11], v[4:5], v[20:21] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[26:27] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[18:19] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[12:13] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[8:9] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[2:3], v[6:7], v[22:23] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[2:3], v[2:3], v[10:11] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[24:25] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[16:17] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[2:3], v[12:13], v[28:29] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[20:21] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[6:7], v[22:23] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[8:9], v[10:11], v[26:27] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[10:11], v[2:3], v[18:19] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v17, v21, v5, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v23, v7, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v27, v11, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[10:11] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v16, v20, v4, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v22, v6, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v26, v10, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[10:11] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[6:7] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[16:17], v[12:13] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[2:3], v[0:1], v[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v17, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v16, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[2:3] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[14:15], v[30:31] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v7, v31, v15, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v30, v14, vcc +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -4123,49 +4125,49 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[8:9], v[24:25] +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[0:1], v[16:17] +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s6, v[6:7], v[22:23] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s4 ; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[10:11], v[26:27] ; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[2:3], v[18:19] -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[6:7], v[22:23] -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s6, v[8:9], v[24:25] -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s7, v[0:1], v[16:17] -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s8, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s5 +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s6 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s7 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s7 -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[12:13] -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s6, v[0:1], v[8:9] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s6 -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[14:15], v[30:31] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v15, v31, v15, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v14, v30, v14, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s6 +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[0:1], v[8:9] ; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[2:3], v[10:11] -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s5 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s4 -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[6:7] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s5 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[14:15], v[30:31] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[12:13] +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s4 +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[2:3], v[6:7] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4227,50 +4229,49 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[8:9], v[24:25] +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[16:17] +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s2, v[6:7], v[22:23] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[10:11], v[26:27] ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[18:19] -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[6:7], v[22:23] -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s2, v[8:9], v[24:25] -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s3, v[0:1], v[16:17] -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[4:5], v[20:21] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[4:5], v[20:21] ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[0:1], v[8:9] ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[10:11] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s2, v[0:1], v[8:9] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[14:15], v[30:31] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[12:13] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[6:7], v[14:15] +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[6:7], v[14:15] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[6:7] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[6:7] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4334,58 +4335,58 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[8:9], v[24:25] +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[16:17] +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s2, v[6:7], v[22:23] +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[10:11], v[26:27] ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[18:19] -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[6:7], v[22:23] -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s2, v[8:9], v[24:25] -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s3, v[0:1], v[16:17] -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[4:5], v[20:21] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[4:5], v[20:21] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[10:11] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s2, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[14:15], v[30:31] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[12:13] ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[6:7], v[14:15] +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[6:7], v[14:15] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[6:7] -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[6:7] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll index 2bcee373d9247..e3a7ae5fd0256 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll @@ -3544,37 +3544,37 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[26:27] -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[24:25] -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[10:11], v[26:27] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[28:29] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[0:1], v[8:9] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[4:5], v[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -3648,37 +3648,37 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[26:27] -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[10:11], v[26:27] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[28:29] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[0:1], v[8:9] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[4:5], v[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -3752,47 +3752,49 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32 -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[8:9], v[24:25] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[0:1], v[16:17] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[8:9], v[12:13], v[28:29] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[10:11], v[4:5], v[20:21] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[26:27] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[0:1], v[2:3], v[18:19] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[8:9] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[2:3], v[6:7], v[22:23] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[2:3], v[2:3], v[10:11] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[24:25] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[16:17] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[2:3], v[12:13], v[28:29] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[4:5], v[20:21] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[6:7], v[22:23] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[8:9], v[10:11], v[26:27] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[10:11], v[2:3], v[18:19] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v17, v21, v5, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v23, v7, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v27, v11, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[10:11] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v16, v20, v4, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v22, v6, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v26, v10, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[10:11] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[0:1], v[16:17], v[12:13] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[2:3], v[0:1], v[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v17, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v16, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[2:3] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[30:31] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v7, v31, v15, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v30, v14, vcc +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -3862,49 +3864,49 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[24:25] +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[0:1], v[16:17] +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s6, v[6:7], v[22:23] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s4 ; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[26:27] ; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[18:19] -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[6:7], v[22:23] -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s6, v[8:9], v[24:25] -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s7, v[0:1], v[16:17] -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s8, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s5 +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s6 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s7 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s7 -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s6, v[0:1], v[8:9] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s6 -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[14:15], v[30:31] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v15, v31, v15, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v14, v30, v14, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s6 +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[0:1], v[8:9] ; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[10:11] -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s5 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s4 -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[6:7] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s5 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[14:15], v[30:31] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s4 +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[6:7] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -3966,50 +3968,49 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[24:25] +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[0:1], v[16:17] +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s2, v[6:7], v[22:23] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[26:27] ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[18:19] -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[6:7], v[22:23] -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s2, v[8:9], v[24:25] -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s3, v[0:1], v[16:17] -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[4:5], v[20:21] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[4:5], v[20:21] ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[0:1], v[8:9] ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[10:11] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s2, v[0:1], v[8:9] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[14:15], v[30:31] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[6:7], v[14:15] +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[6:7], v[14:15] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[6:7] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[6:7] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4073,58 +4074,58 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[24:25] +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[0:1], v[16:17] +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s2, v[6:7], v[22:23] +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[26:27] ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[18:19] -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[6:7], v[22:23] -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s2, v[8:9], v[24:25] -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s3, v[0:1], v[16:17] -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[4:5], v[20:21] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[4:5], v[20:21] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[10:11] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s2, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[14:15], v[30:31] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[6:7], v[14:15] +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[6:7], v[14:15] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[6:7] -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[6:7] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1