diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 5d9937f832396..f27bf2ead9755 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -479,6 +479,9 @@ struct SDNodeFlags { bool operator==(const SDNodeFlags &Other) const { return Flags == Other.Flags; } + bool operator!=(const SDNodeFlags &Other) const { + return !operator==(Other); + } void operator&=(const SDNodeFlags &OtherFlags) { Flags &= OtherFlags.Flags; } void operator|=(const SDNodeFlags &OtherFlags) { Flags |= OtherFlags.Flags; } }; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 231184587d682..a0989d372a33e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16310,6 +16310,23 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { break; } + // Use known bits to apply the nsw/nuw flags to the truncate. + const unsigned DestWidth = VT.getScalarSizeInBits(); + const unsigned SrcWidth = N0.getScalarValueSizeInBits(); + SDNodeFlags Flags = N->getFlags(); + if (!N->getFlags().hasNoSignedWrap() && + DAG.ComputeMaxSignificantBits(N0) <= DestWidth) + Flags.setNoSignedWrap(true); + if (!N->getFlags().hasNoUnsignedWrap() && + DAG.MaskedValueIsZero(N0, APInt::getBitsSetFrom(SrcWidth, DestWidth))) + Flags.setNoUnsignedWrap(true); + + if (Flags != N->getFlags()) { + N->setFlags(Flags); + AddUsersToWorklist(N); + return SDValue(N, 0); + } + return SDValue(); } diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index 34d7ed9290b67..73a0a93b4ff38 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -23754,74 +23754,74 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s6 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s6 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v1, v0, v2, s7 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 ; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s6 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v3, v3, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s6 +; GFX9-NEXT: v_or_b32_e32 
v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v17 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v17 ; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX9-NEXT: v_add3_u32 v6, v6, v5, s6 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX9-NEXT: v_add3_u32 v7, v7, v6, s6 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 -; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v7, v8, vcc -; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX9-NEXT: v_add3_u32 v7, v7, v6, s6 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v16 -; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX9-NEXT: v_bfe_u32 v10, v7, 16, 1 -; GFX9-NEXT: v_add3_u32 v10, v10, v7, s6 -; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: s_mov_b32 s7, 0x7060302 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; GFX9-NEXT: v_perm_b32 v1, v2, v3, s7 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s7 -; GFX9-NEXT: v_perm_b32 v8, v5, v9, s7 -; GFX9-NEXT: v_perm_b32 v7, v6, v10, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc +; GFX9-NEXT: v_perm_b32 v6, v4, v7, s7 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v8, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v16 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v8, v8, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s7 +; GFX9-NEXT: v_perm_b32 v5, v4, v8, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[5:6] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v6 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v10, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 @@ -23836,49 +23836,49 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v16i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_mov_b32 v10, v0 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[2:3] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v11.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[16:17] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[10:11] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v17.l ; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v1, 16, v11 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 @@ -23886,85 +23886,85 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v14, v4, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v12, v4, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v6, v8 :: v_dual_and_b32 v3, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v6, v8 :: v_dual_lshlrev_b32 v3, 16, v10 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v0, 16, v17 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v12, v7 :: v_dual_lshlrev_b32 v8, 16, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v14, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v12, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v11, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo 
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 0x40c00000, v0 :: v_dual_cndmask_b32 v0, v7, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v10, v5, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v12, 0x7fff -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v11, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v8, 0x7fff -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v3, v12, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v11, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v2, v14, v9, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v9, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v12.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v7, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v5, v14 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v7, v2 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v9, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11] +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[2:3] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v14, v9, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 
0xffff, v10, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v7, v5 +; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v6, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v9, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[16:17] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[10:11] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 ; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v11.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v17.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v16i8: @@ -24005,37 +24005,35 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v17 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 ; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v4 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v7, v8 :: v_dual_add_f32 v3, 0x40c00000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_add3_u32 v1, v11, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_cndmask_b32 v0, v5, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v11, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v8 :: v_dual_and_b32 v2, 0xffff0000, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v17 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v7 ; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v2, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 @@ -24043,54 +24041,55 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v2, 0x7fff ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v1, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v7 :: v_dual_lshlrev_b32 v1, 16, v16 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v4, v5 :: v_dual_lshlrev_b32 v4, 16, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v6, v6, v8, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v9, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v8, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v1, v1, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v9, v10, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v9, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v9 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_add3_u32 v0, v0, v1, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v6, v4, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v7, 0x7fff -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v5, v12, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v1, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v0, v7, 0x7060302 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v3, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v9, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v4 ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v9, 8, v7 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-FAKE16-NEXT: .LBB108_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -24356,14 +24355,14 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; GFX9-NEXT: s_cmp_lg_u32 s20, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB109_3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s12, s19, 24 -; GFX9-NEXT: s_lshr_b32 s23, s19, 16 +; GFX9-NEXT: s_lshr_b32 s21, s19, 24 +; GFX9-NEXT: s_lshr_b32 s20, s19, 16 ; GFX9-NEXT: s_lshr_b32 s15, s19, 8 -; GFX9-NEXT: s_lshr_b32 s21, s18, 16 -; GFX9-NEXT: s_lshr_b32 s20, s18, 8 -; GFX9-NEXT: s_lshr_b32 s10, s17, 24 -; GFX9-NEXT: s_lshr_b32 s22, s17, 16 -; GFX9-NEXT: s_lshr_b32 s11, s17, 8 +; GFX9-NEXT: s_lshr_b32 s23, s18, 16 +; GFX9-NEXT: s_lshr_b32 s22, s18, 8 +; GFX9-NEXT: s_lshr_b32 s12, s17, 24 +; GFX9-NEXT: s_lshr_b32 s11, s17, 16 +; GFX9-NEXT: s_lshr_b32 s10, s17, 8 ; GFX9-NEXT: s_lshr_b32 s14, s16, 16 ; GFX9-NEXT: s_lshr_b32 s13, s16, 8 ; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 @@ -24378,20 +24377,20 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v3 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v17 ; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 -; GFX9-NEXT: v_lshl_or_b32 v2, v6, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v2, v0, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 @@ -24416,22 +24415,22 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 ; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc ; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v4, s4, v3 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_add_f32_e32 v5, s4, v3 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v16 ; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 -; GFX9-NEXT: v_lshl_or_b32 v10, v14, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v6, v4, 16, v5 ; GFX9-NEXT: v_add_f32_e32 v4, s4, v3 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 @@ -24450,14 +24449,16 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v8 -; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v3 -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_lshl_or_b32 v5, v4, 16, v3 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[5:6] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v1 @@ -24466,31 +24467,31 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr13 ; GFX9-NEXT: ; implicit-def: $sgpr14 ; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr10 ; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr12 ; GFX9-NEXT: ; implicit-def: $sgpr22 -; GFX9-NEXT: ; implicit-def: $sgpr10 -; GFX9-NEXT: ; implicit-def: $sgpr20 -; GFX9-NEXT: ; implicit-def: $sgpr21 +; GFX9-NEXT: ; implicit-def: $sgpr23 ; GFX9-NEXT: ; implicit-def: $sgpr6 ; GFX9-NEXT: ; implicit-def: $sgpr15 -; GFX9-NEXT: ; implicit-def: $sgpr23 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr20 +; GFX9-NEXT: ; implicit-def: $sgpr21 ; GFX9-NEXT: s_branch .LBB109_2 ; GFX9-NEXT: .LBB109_4: ; GFX9-NEXT: v_mov_b32_e32 v8, s18 ; GFX9-NEXT: v_mov_b32_e32 v16, s19 -; GFX9-NEXT: v_mov_b32_e32 v14, s23 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v17, s17 -; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v10, s21 -; GFX9-NEXT: v_mov_b32_e32 v9, s20 -; GFX9-NEXT: v_mov_b32_e32 v15, s12 +; GFX9-NEXT: v_mov_b32_e32 v10, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s22 +; GFX9-NEXT: v_mov_b32_e32 v15, s21 +; GFX9-NEXT: v_mov_b32_e32 v14, s20 ; GFX9-NEXT: v_mov_b32_e32 v13, s15 ; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v7, s10 -; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, s10 ; GFX9-NEXT: v_mov_b32_e32 v11, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; 
GFX9-NEXT: .LBB109_5: ; %end @@ -24505,14 +24506,14 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; GFX11-NEXT: s_mov_b32 s8, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB109_3 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s11, s3, 24 -; GFX11-NEXT: s_lshr_b32 s18, s3, 16 +; GFX11-NEXT: s_lshr_b32 s16, s3, 24 +; GFX11-NEXT: s_lshr_b32 s15, s3, 16 ; GFX11-NEXT: s_lshr_b32 s14, s3, 8 -; GFX11-NEXT: s_lshr_b32 s16, s2, 16 -; GFX11-NEXT: s_lshr_b32 s15, s2, 8 -; GFX11-NEXT: s_lshr_b32 s9, s1, 24 -; GFX11-NEXT: s_lshr_b32 s17, s1, 16 -; GFX11-NEXT: s_lshr_b32 s10, s1, 8 +; GFX11-NEXT: s_lshr_b32 s18, s2, 16 +; GFX11-NEXT: s_lshr_b32 s17, s2, 8 +; GFX11-NEXT: s_lshr_b32 s11, s1, 24 +; GFX11-NEXT: s_lshr_b32 s10, s1, 16 +; GFX11-NEXT: s_lshr_b32 s9, s1, 8 ; GFX11-NEXT: s_lshr_b32 s13, s0, 16 ; GFX11-NEXT: s_lshr_b32 s12, s0, 8 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 @@ -24535,8 +24536,8 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 ; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: s_lshl_b32 s0, s3, 16 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: s_pack_lh_b32_b16 s1, 0, s3 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 ; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 @@ -24544,100 +24545,101 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 ; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 ; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1 ; GFX11-NEXT: s_lshl_b32 s0, s2, 16 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo ; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s2 -; GFX11-NEXT: v_bfe_u32 v10, v7, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 -; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v7 -; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2 ; GFX11-NEXT: v_bfe_u32 v2, v3, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-NEXT: 
v_or_b32_e32 v13, 0x400000, v6 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v9 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 ; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v11 ; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; GFX11-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v11 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v16 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v17 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v1 -; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v11 -; GFX11-NEXT: v_lshl_or_b32 v10, v14, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v16 +; GFX11-NEXT: v_lshl_or_b32 v2, v1, 16, v4 +; GFX11-NEXT: v_lshl_or_b32 v1, v7, 16, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v10 -; GFX11-NEXT: v_lshl_or_b32 v9, v3, 16, v7 -; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-NEXT: v_lshl_or_b32 v6, v5, 16, v6 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v5, v3, 16, v9 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v6 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v9 ; GFX11-NEXT: s_branch .LBB109_5 ; GFX11-NEXT: .LBB109_3: ; GFX11-NEXT: ; implicit-def: $sgpr12 ; GFX11-NEXT: ; 
implicit-def: $sgpr13 ; GFX11-NEXT: ; implicit-def: $sgpr4 +; GFX11-NEXT: ; implicit-def: $sgpr9 ; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr11 ; GFX11-NEXT: ; implicit-def: $sgpr17 -; GFX11-NEXT: ; implicit-def: $sgpr9 -; GFX11-NEXT: ; implicit-def: $sgpr15 -; GFX11-NEXT: ; implicit-def: $sgpr16 +; GFX11-NEXT: ; implicit-def: $sgpr18 ; GFX11-NEXT: ; implicit-def: $sgpr6 ; GFX11-NEXT: ; implicit-def: $sgpr14 -; GFX11-NEXT: ; implicit-def: $sgpr18 -; GFX11-NEXT: ; implicit-def: $sgpr11 +; GFX11-NEXT: ; implicit-def: $sgpr15 +; GFX11-NEXT: ; implicit-def: $sgpr16 ; GFX11-NEXT: s_branch .LBB109_2 ; GFX11-NEXT: .LBB109_4: ; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v17, s1 -; GFX11-NEXT: v_dual_mov_b32 v16, s3 :: v_dual_mov_b32 v9, s15 -; GFX11-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v15, s11 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v13, s14 -; GFX11-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v1, s12 -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v7, s9 -; GFX11-NEXT: v_dual_mov_b32 v2, s13 :: v_dual_mov_b32 v5, s10 +; GFX11-NEXT: v_dual_mov_b32 v16, s3 :: v_dual_mov_b32 v9, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v15, s16 +; GFX11-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v13, s14 +; GFX11-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v1, s12 +; GFX11-NEXT: v_dual_mov_b32 v2, s13 :: v_dual_mov_b32 v7, s11 +; GFX11-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v5, s9 ; GFX11-NEXT: v_mov_b32_e32 v11, s6 ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: .LBB109_5: ; %end diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index 5163539046bb0..cc310c994c6a7 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -7690,56 +7690,56 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s6 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s6 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v1, v0, v2, s7 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 ; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s6 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v3, v3, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s6 +; GFX9-NEXT: 
v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v8 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v8 ; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX9-NEXT: v_add3_u32 v6, v6, v5, s6 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX9-NEXT: v_add3_u32 v7, v7, v6, s6 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: s_mov_b32 s7, 0x7060302 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GFX9-NEXT: v_perm_b32 v1, v2, v3, s7 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s7 -; GFX9-NEXT: v_perm_b32 v11, v5, v7, s7 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s7 +; GFX9-NEXT: v_perm_b32 v11, v4, v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v12, 0x7fc07fc0 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v3 ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v11 ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 @@ -7760,7 +7760,6 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 @@ -7777,7 +7776,6 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[12:13] ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v13.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v10.l ; GFX11-TRUE16-NEXT: .LBB38_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -7798,52 +7796,51 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 ; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v0, v0, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v4 ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v11, v4, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v12 ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v11, v9 :: v_dual_add_f32 v2, 0x40c00000, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v12 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v10, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v12, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v1, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v7, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v9, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v12, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v10, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v8.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v13 ; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v5, v2 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, 0x7fc07fc0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[12:13] +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v6, v3 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v12 -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v7, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10 ; GFX11-TRUE16-NEXT: .LBB38_4: ; %end @@ -7852,7 +7849,7 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v14.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v13.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l @@ -7891,70 +7888,71 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB38_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, 0x7fc07fc0 :: v_dual_lshlrev_b32 v3, 16, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v2, 0x40c00000, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v8 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 -; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, 0x7fc07fc0 :: v_dual_lshlrev_b32 v5, 16, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_cndmask_b32 v0, v6, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_add3_u32 v1, v10, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v4 :: v_dual_cndmask_b32 v6, v7, v9 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v0, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v4, v8, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v10, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v8, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v1, v1, v5, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v0, v6, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v3, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v1, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v6, v7, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v4, v5, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v7, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; 
GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-FAKE16-NEXT: .LBB38_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v13 @@ -8166,13 +8164,13 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3 ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB39_3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s19, s17, 16 -; GFX9-NEXT: s_lshr_b32 s15, s18, 16 -; GFX9-NEXT: s_lshr_b32 s14, s18, 8 -; GFX9-NEXT: s_lshr_b32 s10, s17, 24 -; GFX9-NEXT: s_lshr_b32 s11, s17, 8 -; GFX9-NEXT: s_lshr_b32 s13, s16, 16 -; GFX9-NEXT: s_lshr_b32 s12, s16, 8 +; GFX9-NEXT: s_lshr_b32 s19, s18, 16 +; GFX9-NEXT: s_lshr_b32 s15, s18, 8 +; GFX9-NEXT: s_lshr_b32 s12, s17, 24 +; GFX9-NEXT: s_lshr_b32 s11, s17, 16 +; GFX9-NEXT: s_lshr_b32 s10, s17, 8 +; GFX9-NEXT: s_lshr_b32 s14, s16, 16 +; GFX9-NEXT: s_lshr_b32 s13, s16, 8 ; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB39_4 @@ -8185,20 +8183,20 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3 ; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: s_lshl_b32 s4, s17, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v3 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v13 ; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 -; GFX9-NEXT: v_lshl_or_b32 v2, v6, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v2, v0, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 @@ -8224,16 +8222,16 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 ; GFX9-NEXT: s_lshl_b32 s4, s18, 16 ; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_add_f32_e32 v3, s4, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v3 ; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v8 @@ -8244,32 +8242,33 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v11 ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, v13 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB39_3: -; GFX9-NEXT: ; implicit-def: $sgpr12 ; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr14 ; GFX9-NEXT: ; implicit-def: $sgpr4 -; GFX9-NEXT: ; implicit-def: $sgpr11 -; GFX9-NEXT: ; implicit-def: $sgpr19 ; GFX9-NEXT: ; implicit-def: $sgpr10 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr12 ; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr19 ; GFX9-NEXT: ; implicit-def: $sgpr6 ; GFX9-NEXT: s_branch .LBB39_2 ; GFX9-NEXT: .LBB39_4: ; GFX9-NEXT: v_mov_b32_e32 v8, s18 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v6, s19 -; GFX9-NEXT: v_mov_b32_e32 v10, s15 -; GFX9-NEXT: v_mov_b32_e32 v9, s14 -; GFX9-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_mov_b32_e32 v7, s10 -; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v10, s19 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, s10 ; GFX9-NEXT: v_mov_b32_e32 v11, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_mov_b32_e32 v4, s17 @@ -8282,13 +8281,13 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3 ; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s13, s2, 16 -; GFX11-NEXT: s_lshr_b32 s12, s2, 8 -; GFX11-NEXT: s_lshr_b32 s8, s1, 24 -; GFX11-NEXT: s_lshr_b32 s14, s1, 16 -; GFX11-NEXT: s_lshr_b32 s9, s1, 8 -; GFX11-NEXT: s_lshr_b32 s11, s0, 16 -; GFX11-NEXT: s_lshr_b32 s10, s0, 8 +; GFX11-NEXT: s_lshr_b32 s14, s2, 16 +; GFX11-NEXT: s_lshr_b32 s13, s2, 8 +; GFX11-NEXT: s_lshr_b32 s10, s1, 24 +; GFX11-NEXT: s_lshr_b32 s9, s1, 16 +; GFX11-NEXT: s_lshr_b32 s8, s1, 8 +; GFX11-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-NEXT: s_lshr_b32 s11, s0, 8 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 @@ -8298,99 +8297,99 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 ; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s3 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 -; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s0 -; GFX11-NEXT: s_lshl_b32 s0, s0, 16 ; GFX11-NEXT: s_pack_lh_b32_b16 s1, 0, s2 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s0 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s2 ; 
GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 ; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 -; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 ; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3 -; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s2 -; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 ; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1 ; GFX11-NEXT: v_mov_b32_e32 v12, 0x7fc07fc0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo -; GFX11-NEXT: v_bfe_u32 v0, v8, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v1, v2, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v0, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v0, v11, vcc_lo ; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3 -; GFX11-NEXT: v_bfe_u32 v3, v7, 16, 1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v9 :: v_dual_add_nc_u32 v3, v3, v7 +; GFX11-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v6 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v0, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo +; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v13 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v2, v9, 16, v3 ; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshl_or_b32 v11, v7, 16, v9 +; GFX11-NEXT: v_lshl_or_b32 v11, v6, 16, v7 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v11 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v11 ; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX11-NEXT: v_mov_b32_e32 v4, v13 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB39_3: -; GFX11-NEXT: ; implicit-def: $sgpr10 ; GFX11-NEXT: ; implicit-def: $sgpr11 +; GFX11-NEXT: ; implicit-def: $sgpr12 ; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr9 -; GFX11-NEXT: ; implicit-def: $sgpr14 ; GFX11-NEXT: ; implicit-def: $sgpr8 -; GFX11-NEXT: ; implicit-def: $sgpr12 +; GFX11-NEXT: ; implicit-def: $sgpr9 +; GFX11-NEXT: ; implicit-def: $sgpr10 ; GFX11-NEXT: ; implicit-def: $sgpr13 +; GFX11-NEXT: ; implicit-def: $sgpr14 ; GFX11-NEXT: ; implicit-def: $sgpr6 ; GFX11-NEXT: s_branch .LBB39_2 ; GFX11-NEXT: .LBB39_4: -; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s12 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s10 -; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s8 -; GFX11-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v5, s9 -; GFX11-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v11, s6 +; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s13 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s11 +; GFX11-NEXT: v_dual_mov_b32 v10, s14 :: v_dual_mov_b32 v7, s10 +; GFX11-NEXT: v_dual_mov_b32 v2, s12 :: v_dual_mov_b32 v5, s8 +; GFX11-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v11, s6 ; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index b7097a9557b75..73b5069c722aa 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -2549,57 +2549,70 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s5, 0 +; GFX6-NEXT: s_mov_b32 s7, s5 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: 
s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s5, s10, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_lshr_b32 s5, s10, 16 +; GFX6-NEXT: s_and_b32 s4, s10, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GFX6-NEXT: s_lshr_b32 s6, s10, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 ; GFX6-NEXT: s_lshr_b32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX6-NEXT: v_trunc_f32_e32 v2, v2 +; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: s_and_b32 s4, s11, 0xffff -; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 -; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 ; GFX6-NEXT: s_and_b32 s4, s9, 0xffff -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 +; GFX6-NEXT: s_flbit_i32_b32 s8, 0 +; GFX6-NEXT: s_lshr_b32 s6, s11, 16 +; GFX6-NEXT: s_min_u32 s8, s8, 32 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc ; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 +; GFX6-NEXT: s_min_u32 s6, s6, 1 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_lshr_b32 s4, s11, 16 -; GFX6-NEXT: v_mad_f32 v3, -v1, v4, v5 -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 ; GFX6-NEXT: s_lshr_b32 s4, s9, 16 +; GFX6-NEXT: s_or_b32 s6, s7, s6 +; GFX6-NEXT: v_mad_f32 v3, -v1, v4, v5 +; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s6 +; GFX6-NEXT: s_min_u32 s4, s4, 1 +; GFX6-NEXT: s_or_b32 s4, s5, s4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX6-NEXT: s_sub_i32 s4, 32, s8 +; GFX6-NEXT: v_ldexp_f32_e64 v5, v5, s4 +; GFX6-NEXT: v_rcp_f32_e32 v7, v5 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_ldexp_f32_e64 v3, v6, s4 +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_f32_e32 v4, v3, v7 +; GFX6-NEXT: v_trunc_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v4 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v3, v6, v7 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 -; GFX6-NEXT: v_mad_f32 v3, -v3, v5, v6 +; GFX6-NEXT: v_mad_f32 v3, -v4, v5, v3 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v6, vcc ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 
0 @@ -2764,62 +2777,75 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s5, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s5, s10, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_lshr_b32 s5, s10, 16 +; GFX6-NEXT: s_and_b32 s4, s10, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GFX6-NEXT: s_lshr_b32 s6, s10, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 ; GFX6-NEXT: s_lshr_b32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 -; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX6-NEXT: v_trunc_f32_e32 v2, v2 +; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc -; GFX6-NEXT: v_mad_f32 v1, -v1, v2, v4 -; GFX6-NEXT: s_and_b32 s6, s11, 0xffff -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 -; GFX6-NEXT: s_and_b32 s5, s9, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX6-NEXT: v_mad_f32 v1, -v1, v3, v4 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 +; GFX6-NEXT: s_and_b32 s7, s11, 0xffff +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s7 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s6 +; GFX6-NEXT: s_and_b32 s6, s9, 0xffff +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; GFX6-NEXT: s_flbit_i32_b32 s8, 0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX6-NEXT: s_lshr_b32 s6, s11, 16 +; GFX6-NEXT: s_mov_b32 s7, s5 +; GFX6-NEXT: s_min_u32 s8, s8, 32 +; GFX6-NEXT: s_lshl_b64 s[14:15], s[6:7], s8 +; GFX6-NEXT: s_min_u32 s7, s14, 1 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v1 -; GFX6-NEXT: s_lshr_b32 s4, s11, 16 +; GFX6-NEXT: s_lshr_b32 s4, s9, 16 +; GFX6-NEXT: s_or_b32 s7, s15, s7 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 -; GFX6-NEXT: s_lshr_b32 s5, s9, 16 +; GFX6-NEXT: s_lshl_b64 s[12:13], s[4:5], s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s7 +; GFX6-NEXT: s_min_u32 s5, s12, 1 +; GFX6-NEXT: s_or_b32 s5, s13, s5 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX6-NEXT: s_sub_i32 s5, 32, s8 +; GFX6-NEXT: v_ldexp_f32_e64 v4, v4, s5 +; GFX6-NEXT: v_rcp_f32_e32 v7, v4 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, v6, v7 -; GFX6-NEXT: v_trunc_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GFX6-NEXT: 
v_ldexp_f32_e64 v2, v6, s5 +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_f32_e32 v3, v2, v7 +; GFX6-NEXT: v_trunc_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v6 +; GFX6-NEXT: v_mad_f32 v2, -v3, v4, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s6 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s9, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 @@ -10099,6 +10125,15 @@ define <2 x i64> @srem_zero_zero() { ; GCN-LABEL: kernel: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_endpgm +; GFX6-LABEL: srem_zero_zero: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: srem_zero_zero: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] entry: %B = srem <2 x i64> zeroinitializer, zeroinitializer ret <2 x i64> %B diff --git a/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll index d0313267b56d7..ef68306e8a51a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs --pass-remarks=si-lower \ ; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=GFX90A-HW @@ -14,6 +15,28 @@ ; GFX90A-HW: ds_add_f64 v2, v[0:1] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw(ptr addrspace(3) %ptr) #0 { +; GFX90A-HW-LABEL: atomic_add_unsafe_hw: +; GFX90A-HW: ; %bb.0: ; %main_body +; GFX90A-HW-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-HW-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-HW-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-HW-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-HW-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX90A-HW-NEXT: s_cbranch_execz .LBB0_2 +; GFX90A-HW-NEXT: ; %bb.1: +; GFX90A-HW-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX90A-HW-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-HW-NEXT: v_cvt_f64_u32_e32 v[0:1], 0 +; GFX90A-HW-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 +; GFX90A-HW-NEXT: v_cvt_f64_u32_e32 v[2:3], s0 +; GFX90A-HW-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX90A-HW-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX90A-HW-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-HW-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-HW-NEXT: ds_add_f64 v2, v[0:1] +; GFX90A-HW-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-HW-NEXT: .LBB0_2: +; GFX90A-HW-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst ret void @@ -23,6 +46,33 @@ main_body: ; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_agent(ptr addrspace(1) %ptr, float %val) #0 { +; GFX90A-HW-LABEL: atomic_add_unsafe_hw_agent: +; GFX90A-HW: ; %bb.0: ; %main_body +; GFX90A-HW-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-HW-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-HW-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-HW-NEXT: s_mov_b32 s1, 0 +; 
GFX90A-HW-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-HW-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-HW-NEXT: s_cbranch_execz .LBB1_2 +; GFX90A-HW-NEXT: ; %bb.1: +; GFX90A-HW-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX90A-HW-NEXT: s_flbit_i32_b32 s2, 0 +; GFX90A-HW-NEXT: s_min_u32 s2, s2, 32 +; GFX90A-HW-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX90A-HW-NEXT: s_min_u32 s0, s0, 1 +; GFX90A-HW-NEXT: s_or_b32 s0, s1, s0 +; GFX90A-HW-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX90A-HW-NEXT: s_load_dword s3, s[4:5], 0x2c +; GFX90A-HW-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-HW-NEXT: s_sub_i32 s2, 32, s2 +; GFX90A-HW-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-HW-NEXT: v_ldexp_f32 v0, v0, s2 +; GFX90A-HW-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-HW-NEXT: v_mul_f32_e32 v0, s3, v0 +; GFX90A-HW-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX90A-HW-NEXT: .LBB1_2: +; GFX90A-HW-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !0 ret void @@ -32,6 +82,33 @@ main_body: ; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_wg(ptr addrspace(1) %ptr, float %val) #0 { +; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wg: +; GFX90A-HW: ; %bb.0: ; %main_body +; GFX90A-HW-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-HW-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-HW-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-HW-NEXT: s_mov_b32 s1, 0 +; GFX90A-HW-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-HW-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-HW-NEXT: s_cbranch_execz .LBB2_2 +; GFX90A-HW-NEXT: ; %bb.1: +; GFX90A-HW-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX90A-HW-NEXT: s_flbit_i32_b32 s2, 0 +; GFX90A-HW-NEXT: s_min_u32 s2, s2, 32 +; GFX90A-HW-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX90A-HW-NEXT: s_min_u32 s0, s0, 1 +; GFX90A-HW-NEXT: s_or_b32 s0, s1, s0 +; GFX90A-HW-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX90A-HW-NEXT: s_load_dword s3, s[4:5], 0x2c +; GFX90A-HW-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-HW-NEXT: s_sub_i32 s2, 32, s2 +; GFX90A-HW-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-HW-NEXT: v_ldexp_f32 v0, v0, s2 +; GFX90A-HW-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-HW-NEXT: v_mul_f32_e32 v0, s3, v0 +; GFX90A-HW-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX90A-HW-NEXT: .LBB2_2: +; GFX90A-HW-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory !0 ret void @@ -41,6 +118,33 @@ main_body: ; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_wavefront(ptr addrspace(1) %ptr, float %val) #0 { +; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wavefront: +; GFX90A-HW: ; %bb.0: ; %main_body +; GFX90A-HW-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-HW-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-HW-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-HW-NEXT: s_mov_b32 s1, 0 +; GFX90A-HW-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-HW-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-HW-NEXT: s_cbranch_execz .LBB3_2 +; GFX90A-HW-NEXT: ; %bb.1: +; GFX90A-HW-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX90A-HW-NEXT: s_flbit_i32_b32 s2, 0 +; GFX90A-HW-NEXT: s_min_u32 s2, s2, 32 +; GFX90A-HW-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX90A-HW-NEXT: s_min_u32 s0, s0, 1 +; GFX90A-HW-NEXT: s_or_b32 s0, s1, s0 +; GFX90A-HW-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX90A-HW-NEXT: s_load_dword s3, s[4:5], 0x2c +; GFX90A-HW-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 
+; GFX90A-HW-NEXT: s_sub_i32 s2, 32, s2 +; GFX90A-HW-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-HW-NEXT: v_ldexp_f32 v0, v0, s2 +; GFX90A-HW-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-HW-NEXT: v_mul_f32_e32 v0, s3, v0 +; GFX90A-HW-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX90A-HW-NEXT: .LBB3_2: +; GFX90A-HW-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory !0 ret void @@ -50,6 +154,33 @@ main_body: ; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_single_thread(ptr addrspace(1) %ptr, float %val) #0 { +; GFX90A-HW-LABEL: atomic_add_unsafe_hw_single_thread: +; GFX90A-HW: ; %bb.0: ; %main_body +; GFX90A-HW-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-HW-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-HW-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-HW-NEXT: s_mov_b32 s1, 0 +; GFX90A-HW-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-HW-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-HW-NEXT: s_cbranch_execz .LBB4_2 +; GFX90A-HW-NEXT: ; %bb.1: +; GFX90A-HW-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX90A-HW-NEXT: s_flbit_i32_b32 s2, 0 +; GFX90A-HW-NEXT: s_min_u32 s2, s2, 32 +; GFX90A-HW-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX90A-HW-NEXT: s_min_u32 s0, s0, 1 +; GFX90A-HW-NEXT: s_or_b32 s0, s1, s0 +; GFX90A-HW-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX90A-HW-NEXT: s_load_dword s3, s[4:5], 0x2c +; GFX90A-HW-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-HW-NEXT: s_sub_i32 s2, 32, s2 +; GFX90A-HW-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-HW-NEXT: v_ldexp_f32 v0, v0, s2 +; GFX90A-HW-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-HW-NEXT: v_mul_f32_e32 v0, s3, v0 +; GFX90A-HW-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX90A-HW-NEXT: .LBB4_2: +; GFX90A-HW-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory !0 ret void @@ -59,6 +190,33 @@ main_body: ; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_aoa(ptr addrspace(1) %ptr, float %val) #0 { +; GFX90A-HW-LABEL: atomic_add_unsafe_hw_aoa: +; GFX90A-HW: ; %bb.0: ; %main_body +; GFX90A-HW-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-HW-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-HW-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-HW-NEXT: s_mov_b32 s1, 0 +; GFX90A-HW-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-HW-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-HW-NEXT: s_cbranch_execz .LBB5_2 +; GFX90A-HW-NEXT: ; %bb.1: +; GFX90A-HW-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX90A-HW-NEXT: s_flbit_i32_b32 s2, 0 +; GFX90A-HW-NEXT: s_min_u32 s2, s2, 32 +; GFX90A-HW-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX90A-HW-NEXT: s_min_u32 s0, s0, 1 +; GFX90A-HW-NEXT: s_or_b32 s0, s1, s0 +; GFX90A-HW-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX90A-HW-NEXT: s_load_dword s3, s[4:5], 0x2c +; GFX90A-HW-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-HW-NEXT: s_sub_i32 s2, 32, s2 +; GFX90A-HW-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-HW-NEXT: v_ldexp_f32 v0, v0, s2 +; GFX90A-HW-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-HW-NEXT: v_mul_f32_e32 v0, s3, v0 +; GFX90A-HW-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX90A-HW-NEXT: .LBB5_2: +; GFX90A-HW-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !0 ret void @@ -68,6 +226,33 @@ main_body: ; GFX90A-HW: global_atomic_add_f32 v0, v1, 
s[2:3] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_wgoa(ptr addrspace(1) %ptr, float %val) #0 { +; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wgoa: +; GFX90A-HW: ; %bb.0: ; %main_body +; GFX90A-HW-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-HW-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-HW-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-HW-NEXT: s_mov_b32 s1, 0 +; GFX90A-HW-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-HW-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-HW-NEXT: s_cbranch_execz .LBB6_2 +; GFX90A-HW-NEXT: ; %bb.1: +; GFX90A-HW-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX90A-HW-NEXT: s_flbit_i32_b32 s2, 0 +; GFX90A-HW-NEXT: s_min_u32 s2, s2, 32 +; GFX90A-HW-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX90A-HW-NEXT: s_min_u32 s0, s0, 1 +; GFX90A-HW-NEXT: s_or_b32 s0, s1, s0 +; GFX90A-HW-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX90A-HW-NEXT: s_load_dword s3, s[4:5], 0x2c +; GFX90A-HW-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-HW-NEXT: s_sub_i32 s2, 32, s2 +; GFX90A-HW-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-HW-NEXT: v_ldexp_f32 v0, v0, s2 +; GFX90A-HW-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-HW-NEXT: v_mul_f32_e32 v0, s3, v0 +; GFX90A-HW-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX90A-HW-NEXT: .LBB6_2: +; GFX90A-HW-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("workgroup-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !0 ret void @@ -77,6 +262,33 @@ main_body: ; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_wfoa(ptr addrspace(1) %ptr, float %val) #0 { +; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wfoa: +; GFX90A-HW: ; %bb.0: ; %main_body +; GFX90A-HW-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-HW-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-HW-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-HW-NEXT: s_mov_b32 s1, 0 +; GFX90A-HW-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-HW-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-HW-NEXT: s_cbranch_execz .LBB7_2 +; GFX90A-HW-NEXT: ; %bb.1: +; GFX90A-HW-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX90A-HW-NEXT: s_flbit_i32_b32 s2, 0 +; GFX90A-HW-NEXT: s_min_u32 s2, s2, 32 +; GFX90A-HW-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX90A-HW-NEXT: s_min_u32 s0, s0, 1 +; GFX90A-HW-NEXT: s_or_b32 s0, s1, s0 +; GFX90A-HW-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX90A-HW-NEXT: s_load_dword s3, s[4:5], 0x2c +; GFX90A-HW-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-HW-NEXT: s_sub_i32 s2, 32, s2 +; GFX90A-HW-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-HW-NEXT: v_ldexp_f32 v0, v0, s2 +; GFX90A-HW-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-HW-NEXT: v_mul_f32_e32 v0, s3, v0 +; GFX90A-HW-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX90A-HW-NEXT: .LBB7_2: +; GFX90A-HW-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("wavefront-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !0 ret void @@ -86,6 +298,33 @@ main_body: ; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_stoa(ptr addrspace(1) %ptr, float %val) #0 { +; GFX90A-HW-LABEL: atomic_add_unsafe_hw_stoa: +; GFX90A-HW: ; %bb.0: ; %main_body +; GFX90A-HW-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-HW-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-HW-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-HW-NEXT: s_mov_b32 s1, 0 +; GFX90A-HW-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-HW-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-HW-NEXT: s_cbranch_execz .LBB8_2 +; GFX90A-HW-NEXT: ; %bb.1: +; 
GFX90A-HW-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX90A-HW-NEXT: s_flbit_i32_b32 s2, 0 +; GFX90A-HW-NEXT: s_min_u32 s2, s2, 32 +; GFX90A-HW-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX90A-HW-NEXT: s_min_u32 s0, s0, 1 +; GFX90A-HW-NEXT: s_or_b32 s0, s1, s0 +; GFX90A-HW-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX90A-HW-NEXT: s_load_dword s3, s[4:5], 0x2c +; GFX90A-HW-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-HW-NEXT: s_sub_i32 s2, 32, s2 +; GFX90A-HW-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-HW-NEXT: v_ldexp_f32 v0, v0, s2 +; GFX90A-HW-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-HW-NEXT: v_mul_f32_e32 v0, s3, v0 +; GFX90A-HW-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX90A-HW-NEXT: .LBB8_2: +; GFX90A-HW-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("singlethread-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !0 ret void diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index 53b2542cf9a7e..bc9ad0b2864c8 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -2150,17 +2150,17 @@ define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) { ; SDAG-LABEL: load_v6i8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:4 -; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0 offset:4 +; SDAG-NEXT: buffer_load_dword v6, off, s[16:19], 0 ; SDAG-NEXT: s_waitcnt vmcnt(1) -; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v0 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0 -; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; SDAG-NEXT: v_mov_b32_e32 v4, v6 -; SDAG-NEXT: v_mov_b32_e32 v1, v7 +; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[6:7] +; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v6 +; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v7 +; SDAG-NEXT: v_mov_b32_e32 v0, v6 +; SDAG-NEXT: v_mov_b32_e32 v4, v7 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: load_v6i8: @@ -3610,17 +3610,17 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) { ; SDAG-LABEL: volatile_load_v6i8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc -; SDAG-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:4 glc +; SDAG-NEXT: buffer_load_dword v6, off, s[16:19], 0 glc +; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0 offset:4 glc ; SDAG-NEXT: s_waitcnt vmcnt(1) -; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; SDAG-NEXT: v_mov_b32_e32 v4, v6 -; SDAG-NEXT: v_mov_b32_e32 v1, v7 +; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v6 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v0 +; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[6:7] +; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v7 +; SDAG-NEXT: v_mov_b32_e32 v0, v6 +; SDAG-NEXT: v_mov_b32_e32 v4, v7 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: volatile_load_v6i8: diff --git 
a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 745e047348626..c2d9de946c742 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -2041,116 +2041,159 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v8i8_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[7:8], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: v_mov_b32_e32 v4, 0 +; SI-NEXT: buffer_load_dwordx2 v[8:9], v[3:4], s[8:11], 0 addr64 +; SI-NEXT: s_flbit_i32_b32 s0, 0 +; SI-NEXT: s_min_u32 s6, s0, 32 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_sub_i32 s7, 32, s6 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 -; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 -; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v9 +; SI-NEXT: v_lshl_b64 v[3:4], v[3:4], s6 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v7, v8 +; SI-NEXT: v_min_u32_e32 v3, 1, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_u32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v9 +; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v9 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v9 +; SI-NEXT: v_ldexp_f32_e64 v3, v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-NEXT: s_flbit_i32_b32 s4, 0 +; VI-NEXT: v_mov_b32_e32 v6, 0 +; VI-NEXT: s_min_u32 s4, s4, 32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[7:8], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[3:4], v[0:1] ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_sub_i32 s5, 32, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 -; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7 -; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7 -; VI-NEXT: v_cvt_f32_ubyte3_e32 v7, v8 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 -; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 -; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v4 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 +; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 +; VI-NEXT: 
v_cvt_f32_ubyte0_e32 v0, v4 +; VI-NEXT: v_lshlrev_b64 v[4:5], s4, v[5:6] +; VI-NEXT: v_cvt_f32_ubyte3_e32 v7, v3 +; VI-NEXT: v_min_u32_e32 v4, 1, v4 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_cvt_f32_u32_e32 v8, v4 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v6, v3 +; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v3 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v3 +; VI-NEXT: v_ldexp_f32 v3, v8, s5 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v8i8_to_v8f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_mov_b32_e32 v10, 0 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[8:9], v0, s[2:3] +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_flbit_i32_b32 s2, 0 +; GFX10-NEXT: s_min_u32 s2, s2, 32 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v7, v9 -; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v6, v9 -; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v9 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v9 -; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v8 -; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v8 -; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v8 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v8 -; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 -; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v9 +; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v7, v8 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s2, v[10:11] +; GFX10-NEXT: s_sub_i32 s2, 32, s2 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v9 +; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v9 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX10-NEXT: v_ldexp_f32 v3, v0, s2 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v9 +; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[0:1] offset:16 ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v8i8_to_v8f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[7:8], v0, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] +; GFX9-NEXT: s_flbit_i32_b32 s2, 0 +; GFX9-NEXT: s_min_u32 s2, s2, 32 +; GFX9-NEXT: s_sub_i32 s3, 32, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 -; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 -; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v7 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v7 -; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v7, v8 -; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 -; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 -; GFX9-NEXT: global_store_dwordx4 v9, v[4:7], s[0:1] offset:16 -; GFX9-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], s2, v[7:8] +; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v7, v3 +; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v9, v4 +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v6, v3 +; GFX9-NEXT: 
v_cvt_f32_ubyte1_e32 v5, v3 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v3 +; GFX9-NEXT: v_ldexp_f32 v3, v9, s3 +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v8i8_to_v8f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v10, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[8:9], v0, s[2:3] +; GFX11-NEXT: s_clz_i32_u32 s2, 0 +; GFX11-NEXT: s_min_u32 s2, s2, 32 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v7, v9 -; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v6, v9 -; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v5, v9 -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v9 -; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v8 -; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v8 -; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v8 -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 24, v9 +; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v7, v8 +; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], s2, v[10:11] +; GFX11-NEXT: s_sub_i32 s2, 32, s2 +; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-NEXT: v_ldexp_f32 v3, v0, s2 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v9 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v10, v[4:7], s[0:1] offset:16 -; GFX11-NEXT: global_store_b128 v10, v[0:3], s[0:1] +; GFX11-NEXT: global_store_b128 v11, v[4:7], s[0:1] +; GFX11-NEXT: global_store_b128 v11, v[0:3], s[0:1] offset:16 ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <8 x i8>, ptr addrspace(1) %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll index a07f1d8a02941..5f81a11b93ced 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -490,19 +490,21 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB2_3: ; %exit -; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00 -; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x3800 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v2, v5 src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX9-NEXT: v_cmp_nge_f16_e32 vcc, 0.5, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x3900 +; GFX9-NEXT: v_mov_b32_e32 v5, 0x3d00 ; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, 
v0, v1, vcc -; GFX9-NEXT: v_pack_b32_f16 v1, v0, v5 -; GFX9-NEXT: v_pack_b32_f16 v0, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v2, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-NEXT: v_pack_b32_f16 v1, v3, v6 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 @@ -523,20 +525,20 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; GFX11-TRUE16-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: .LBB2_3: ; %exit -; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2.l +; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v3.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3900 -; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e64 s0, 0.5, v2.h -; GFX11-TRUE16-NEXT: v_cmp_nge_f16_e64 s1, 0.5, v3.l -; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e64 s2, 0.5, v3.l +; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e64 s0, 0.5, v2.l +; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e64 s1, 0.5, v2.h +; GFX11-TRUE16-NEXT: v_cmp_nge_f16_e64 s2, 0.5, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, 0x3d00, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, 0x3d00, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, 0x3d00, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, 0x3d00, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x3d00, v0.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.l, 0x3d00, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v1.l -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v2.l, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, 0x3d00, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x3d00, v0.l, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v1.h +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v2.l ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB2_4: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 @@ -557,19 +559,23 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; GFX11-FAKE16-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: .LBB2_3: ; %exit -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x3d00 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-FAKE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v1 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0 -; GFX11-FAKE16-NEXT: v_cmp_nge_f16_e32 vcc_lo, 0.5, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v3 
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, 0x3900 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0x3d00 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v3, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x3900, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x3900, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x3900, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_nge_f16_e32 vcc_lo, 0.5, v0 ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v2, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, 0x3d00, v5, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v3, v4 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB2_4: @@ -1213,19 +1219,21 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX9-NEXT: .LBB5_3: ; %exit -; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00 -; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x3800 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v3 src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX9-NEXT: v_cmp_nge_f16_e32 vcc, 0.5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v5, v5, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x3900 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x3d00 ; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_pack_b32_f16 v1, v0, v4 -; GFX9-NEXT: v_pack_b32_f16 v0, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc +; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v2, vcc +; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-NEXT: v_pack_b32_f16 v1, v5, v6 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB5_4: ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 @@ -1250,20 +1258,20 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; GFX11-TRUE16-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: .LBB5_3: ; %exit -; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2.l +; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v3.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3900 -; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e64 s0, 0.5, v2.h -; GFX11-TRUE16-NEXT: v_cmp_nge_f16_e64 s1, 0.5, v3.l -; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e64 s2, 0.5, v3.l +; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e64 s0, 0.5, v2.l +; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e64 s1, 0.5, v2.h +; GFX11-TRUE16-NEXT: v_cmp_nge_f16_e64 s2, 0.5, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, 0x3d00, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, 0x3d00, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, 0x3d00, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, 0x3d00, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x3d00, v0.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.l, 0x3d00, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v1.l -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v2.l, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, 0x3d00, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x3d00, v0.l, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v1.h +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v2.l ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB5_4: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 @@ -1288,19 +1296,23 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; GFX11-FAKE16-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: .LBB5_3: ; %exit -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x3d00 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-FAKE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v1 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0 -; GFX11-FAKE16-NEXT: v_cmp_nge_f16_e32 vcc_lo, 0.5, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, 0x3900 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0x3d00 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v3, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x3900, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x3900, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x3900, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_nge_f16_e32 vcc_lo, 0.5, v0 ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v2, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, 0x3d00, v5, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v3, v4 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB5_4: diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll index 63ba18a5433aa..afdcfb8dfe32b 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll @@ -189,18 +189,22 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0] ; GFX10-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0] ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0] ; GFX10-NEXT: v_cvt_f16_u16_e32 v4, v3 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f16_u16_e32 v5, v2 -; GFX10-NEXT: v_cvt_f16_u16_e32 v6, v1 -; GFX10-NEXT: v_cvt_f16_u16_e32 v7, v0 -; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f16_u16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f16_u16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f16_u16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_pack_b32_f16 v0, v7, v0 -; GFX10-NEXT: v_pack_b32_f16 v1, v6, v1 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f16_u16_e32 v1, v1 +; GFX10-NEXT: v_cvt_f16_u16_e32 v0, v0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX10-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_pack_b32_f16 v0, v0, v6 +; GFX10-NEXT: v_pack_b32_f16 v1, v1, v7 ; GFX10-NEXT: v_pack_b32_f16 v2, v5, v2 ; GFX10-NEXT: v_pack_b32_f16 v3, v4, v3 ; GFX10-NEXT: v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1] @@ -213,30 +217,40 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v4, v2, 1 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v2, v1, 1 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v5, v0, 1 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v7, v0, 1 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.h ; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v3.l -; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.h, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v1.l, v2.l -; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v1.h, v5.l -; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v2.h, v2.h -; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v3.h, v3.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v4, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v1.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.h +; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h +; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v1.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v6 +; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v8, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.h, v8 +; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v4, v0.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.l, v3 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.h, v5 +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v5, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_pk_mul_f16 v2, 0x7000, v4 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v2, v0.h, v3.l +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v6, v1.h, v3.l ; GFX11-TRUE16-NEXT: v_pack_b32_f16 v3, v0.l, v3.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_pk_mul_f16 v0, 0x7000, v4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_pk_mul_f16 v1, 0x7000, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_mul_f16 v0, 0x7000, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -244,27 +258,31 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v4, v3 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v5, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_cvt_f32_u32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_cvt_f32_u32_e32 v6, v6 +; GFX11-FAKE16-NEXT: v_cvt_f32_u32_e32 v7, v7 ; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v1, v1 ; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v0, v0 -; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v6, v6 -; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v7, v7 -; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 
v2, v2 -; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v6 -; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v7 +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v5 +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_pack_b32_f16 v2, v5, v2 +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v2, v2, v7 ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v3, v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1] @@ -1085,9 +1103,10 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0] -; GFX10-NEXT: v_cvt_f16_u16_e32 v1, v0 -; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f16_u16_e32 v0, v0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX10-NEXT: v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1095,9 +1114,13 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h ; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l -; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-TRUE16-NEXT: v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1] @@ -1110,9 +1133,11 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v0, v0 -; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %shl = shl nsw nuw <2 x i16> <i16 2, i16 2>, %cnt diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll
b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll index 6b02e6b05f1b7..f130a8cab8582 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll @@ -4,24 +4,32 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #1 { ; GCN-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[0:1], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GCN-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GCN-NEXT: s_flbit_i32_b32 s6, 0 +; GCN-NEXT: s_min_u32 s6, s6, 32 +; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 +; GCN-NEXT: s_min_u32 s0, s0, 1 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 +; GCN-NEXT: s_sub_i32 s1, 32, s6 ; GCN-NEXT: s_mov_b64 s[6:7], 0 +; GCN-NEXT: v_ldexp_f32 v1, v1, s1 ; GCN-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s8, s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s8 ; GCN-NEXT: .LBB0_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_mov_b32_e32 v5, v1 @@ -52,20 +60,28 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrsp define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #1 { ; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[0:1], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GCN-NEXT: s_mov_b64 s[2:3], exec +; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; GCN-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GCN-NEXT: s_flbit_i32_b32 s2, 0 +; GCN-NEXT: s_min_u32 s2, s2, 32 +; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GCN-NEXT: s_min_u32 s0, s0, 1 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: s_sub_i32 s2, 32, s2 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_ldexp_f32 v0, v0, s2 +; GCN-NEXT: v_mul_f32_e32 v0, 4.0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] +; GCN-NEXT: global_atomic_add_f32 v1, v0, s[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: .LBB1_2: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll index 
bd9aa0f5a454a..f31de6dbcab72 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=gfx803 -d - | FileCheck -check-prefix=DISASSEMBLY-VI %s @@ -11,20 +12,28 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #0 { ; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[0:1], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GCN-NEXT: s_mov_b64 s[2:3], exec +; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; GCN-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GCN-NEXT: s_flbit_i32_b32 s2, 0 +; GCN-NEXT: s_min_u32 s2, s2, 32 +; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GCN-NEXT: s_min_u32 s0, s0, 1 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_sub_i32 s2, 32, s2 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_ldexp_f32 v0, v0, s2 +; GCN-NEXT: v_mul_f32_e32 v0, 4.0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] +; GCN-NEXT: global_atomic_add_f32 v1, v0, s[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: .LBB0_2: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 37756d15861be..3c1830cf9e163 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -19,6 +19,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: s_mov_b32 s7, 0 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -26,15 +27,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_flbit_i32_b32 s2, 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-NEXT: s_min_u32 s2, s2, 32 +; GFX7LESS-NEXT: s_lshl_b64 s[6:7], s[6:7], s2 +; GFX7LESS-NEXT: s_sub_i32 s2, 32, s2 +; GFX7LESS-NEXT: s_min_u32 s6, s6, 1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword 
s8, s[0:1], 0x0 +; GFX7LESS-NEXT: s_or_b32 s6, s7, s6 +; GFX7LESS-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX7LESS-NEXT: v_ldexp_f32_e64 v0, v0, s2 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s8 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -54,23 +62,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-NEXT: s_flbit_i32_b32 s4, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_min_u32 s4, s4, 32 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX9-NEXT: s_min_u32 s2, s2, 1 +; GFX9-NEXT: s_or_b32 s2, s3, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NEXT: s_sub_i32 s2, 32, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_ldexp_f32 v0, v0, s2 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 @@ -86,23 +102,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_flbit_i32_b32 s4, 0 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX1064-NEXT: s_min_u32 s4, s4, 32 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX1064-NEXT: s_min_u32 s2, s2, 1 +; GFX1064-NEXT: s_or_b32 s2, s3, s2 +; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX1064-NEXT: s_sub_i32 s2, 32, s4 +; GFX1064-NEXT: v_ldexp_f32 v0, v0, s2 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: 
v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: v_mov_b32_e32 v1, s5 ; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -150,17 +174,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s3, 0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1164-NEXT: s_clz_i32_u32 s6, 0 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; GFX1164-NEXT: s_min_u32 s6, s6, 32 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 +; GFX1164-NEXT: s_lshl_b64 s[0:1], s[2:3], s6 +; GFX1164-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX1164-NEXT: s_min_u32 s0, s0, 1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_or_b32 s0, s1, s0 +; GFX1164-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX1164-NEXT: s_sub_i32 s0, 32, s6 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_ldexp_f32 v0, v0, s0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f32_e32 v0, 4.0, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -190,6 +224,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -197,15 +232,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-DPP-NEXT: s_flbit_i32_b32 s2, 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-DPP-NEXT: s_min_u32 s2, s2, 32 +; GFX7LESS-DPP-NEXT: s_lshl_b64 s[6:7], s[6:7], s2 +; GFX7LESS-DPP-NEXT: s_sub_i32 s2, 32, s2 +; GFX7LESS-DPP-NEXT: s_min_u32 s6, s6, 1 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_or_b32 s6, s7, s6 +; GFX7LESS-DPP-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX7LESS-DPP-NEXT: v_ldexp_f32_e64 v0, v0, s2 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s8 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -225,23 +267,31 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-DPP-NEXT: s_flbit_i32_b32 s4, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-DPP-NEXT: s_min_u32 s4, s4, 32 +; GFX9-DPP-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX9-DPP-NEXT: s_min_u32 s2, s2, 1 +; GFX9-DPP-NEXT: s_or_b32 s2, s3, s2 +; GFX9-DPP-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_sub_i32 s2, 32, s4 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_ldexp_f32 v0, v0, s2 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -257,23 +307,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_flbit_i32_b32 s4, 0 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX1064-DPP-NEXT: s_min_u32 s4, s4, 32 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX1064-DPP-NEXT: s_min_u32 s2, s2, 1 +; GFX1064-DPP-NEXT: s_or_b32 s2, s3, s2 +; GFX1064-DPP-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX1064-DPP-NEXT: s_sub_i32 s2, 32, s4 +; GFX1064-DPP-NEXT: v_ldexp_f32 v0, v0, s2 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s5 ; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: 
v_add_f32_e32 v0, v1, v2 @@ -321,17 +379,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b32 s3, 0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1164-DPP-NEXT: s_clz_i32_u32 s6, 0 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; GFX1164-DPP-NEXT: s_min_u32 s6, s6, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 +; GFX1164-DPP-NEXT: s_lshl_b64 s[0:1], s[2:3], s6 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_min_u32 s0, s0, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_or_b32 s0, s1, s0 +; GFX1164-DPP-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX1164-DPP-NEXT: s_sub_i32 s0, 32, s6 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_ldexp_f32 v0, v0, s0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f32_e32 v0, 4.0, v0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5745,11 +5813,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 +; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX7LESS-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 @@ -5776,24 +5847,27 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: v_cvt_f64_u32_e32 v[2:3], s0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mul_f64 v[4:5], 
v[0:1], 4.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -5810,24 +5884,27 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], 0 +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX1064-NEXT: v_cvt_f64_u32_e32 v[2:3], s0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX1064-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -5885,18 +5962,22 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], 0 ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1164-NEXT: v_cvt_f64_u32_e32 v[2:3], s0 ; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 +; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5959,11 +6040,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 +; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 @@ -5990,24 +6074,27 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[2:3], s0 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX9-DPP-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -6024,24 +6111,27 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], 0 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[2:3], s0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mul_f64 
v[4:5], v[0:1], 4.0 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -6099,18 +6189,22 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], 0 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[2:3], s0 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -12298,6 +12392,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: s_mov_b32 s7, 0 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -12305,15 +12400,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_cbranch_execz .LBB18_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_flbit_i32_b32 s2, 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-NEXT: s_min_u32 s2, s2, 32 +; GFX7LESS-NEXT: s_lshl_b64 s[6:7], s[6:7], s2 +; GFX7LESS-NEXT: s_sub_i32 s2, 32, s2 +; GFX7LESS-NEXT: s_min_u32 s6, s6, 1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX7LESS-NEXT: s_or_b32 s6, s7, s6 +; GFX7LESS-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX7LESS-NEXT: v_ldexp_f32_e64 v0, v0, s2 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s8 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12333,23 +12435,31 @@ define amdgpu_kernel 
void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB18_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-NEXT: s_flbit_i32_b32 s4, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_min_u32 s4, s4, 32 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX9-NEXT: s_min_u32 s2, s2, 1 +; GFX9-NEXT: s_or_b32 s2, s3, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NEXT: s_sub_i32 s2, 32, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_ldexp_f32 v0, v0, s2 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 @@ -12365,23 +12475,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB18_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_flbit_i32_b32 s4, 0 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX1064-NEXT: s_min_u32 s4, s4, 32 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX1064-NEXT: s_min_u32 s2, s2, 1 +; GFX1064-NEXT: s_or_b32 s2, s3, s2 +; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX1064-NEXT: s_sub_i32 s2, 32, s4 +; GFX1064-NEXT: v_ldexp_f32 v0, v0, s2 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: v_mov_b32_e32 v1, s5 ; GFX1064-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -12429,17 +12547,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; 
GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s3, 0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB18_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1164-NEXT: s_clz_i32_u32 s6, 0 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; GFX1164-NEXT: s_min_u32 s6, s6, 32 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 +; GFX1164-NEXT: s_lshl_b64 s[0:1], s[2:3], s6 +; GFX1164-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX1164-NEXT: s_min_u32 s0, s0, 1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_or_b32 s0, s1, s0 +; GFX1164-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX1164-NEXT: s_sub_i32 s0, 32, s6 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_ldexp_f32 v0, v0, s0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f32_e32 v0, 4.0, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -12469,6 +12597,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -12476,15 +12605,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-DPP-NEXT: s_flbit_i32_b32 s2, 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-DPP-NEXT: s_min_u32 s2, s2, 32 +; GFX7LESS-DPP-NEXT: s_lshl_b64 s[6:7], s[6:7], s2 +; GFX7LESS-DPP-NEXT: s_sub_i32 s2, 32, s2 +; GFX7LESS-DPP-NEXT: s_min_u32 s6, s6, 1 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_or_b32 s6, s7, s6 +; GFX7LESS-DPP-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX7LESS-DPP-NEXT: v_ldexp_f32_e64 v0, v0, s2 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s8 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12504,23 +12640,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX9-DPP-LABEL: 
global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-DPP-NEXT: s_flbit_i32_b32 s4, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-DPP-NEXT: s_min_u32 s4, s4, 32 +; GFX9-DPP-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX9-DPP-NEXT: s_min_u32 s2, s2, 1 +; GFX9-DPP-NEXT: s_or_b32 s2, s3, s2 +; GFX9-DPP-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_sub_i32 s2, 32, s4 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_ldexp_f32 v0, v0, s2 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -12536,23 +12680,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_flbit_i32_b32 s4, 0 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX1064-DPP-NEXT: s_min_u32 s4, s4, 32 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX1064-DPP-NEXT: s_min_u32 s2, s2, 1 +; GFX1064-DPP-NEXT: s_or_b32 s2, s3, s2 +; GFX1064-DPP-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX1064-DPP-NEXT: s_sub_i32 s2, 32, s4 +; GFX1064-DPP-NEXT: v_ldexp_f32 v0, v0, s2 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s5 ; GFX1064-DPP-NEXT: .LBB18_2: ; 
%atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
@@ -12600,17 +12752,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: s_mov_b32 s3, 0
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], exec
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB18_2
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
-; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX1164-DPP-NEXT: s_clz_i32_u32 s6, 0
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1]
+; GFX1164-DPP-NEXT: s_min_u32 s6, s6, 32
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
+; GFX1164-DPP-NEXT: s_lshl_b64 s[0:1], s[2:3], s6
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1164-DPP-NEXT: s_min_u32 s0, s0, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_or_b32 s0, s1, s0
+; GFX1164-DPP-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX1164-DPP-NEXT: s_sub_i32 s0, 32, s6
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_ldexp_f32 v0, v0, s0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f32_e32 v0, 4.0, v0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12644,6 +12806,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT: s_mov_b32 s7, 0
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -12651,15 +12814,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
; GFX7LESS-NEXT: s_cbranch_execz .LBB19_3
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT: s_flbit_i32_b32 s2, 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX7LESS-NEXT: s_min_u32 s2, s2, 32
+; GFX7LESS-NEXT: s_lshl_b64 s[6:7], s[6:7], s2
+; GFX7LESS-NEXT: s_sub_i32 s2, 32, s2
+; GFX7LESS-NEXT: s_min_u32 s6, s6, 1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0x0
+; GFX7LESS-NEXT: s_or_b32 s6, s7, s6
+; GFX7LESS-NEXT: v_cvt_f32_u32_e32 v0, s6
+; GFX7LESS-NEXT: v_ldexp_f32_e64 v0, v0, s2
; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s8
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -12679,23 +12849,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
;
; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB19_3
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX9-NEXT: s_flbit_i32_b32 s4, 0
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT: s_min_u32 s4, s4, 32
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX9-NEXT: s_min_u32 s2, s2, 1
+; GFX9-NEXT: s_or_b32 s2, s3, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0
+; GFX9-NEXT: s_sub_i32 s2, 32, s4
; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_ldexp_f32 v0, v0, s2
+; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
@@ -12711,23 +12889,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
;
; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[6:7], exec
+; GFX1064-NEXT: s_mov_b32 s3, 0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB19_3
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_flbit_i32_b32 s4, 0
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX1064-NEXT: s_min_u32 s4, s4, 32
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1064-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX1064-NEXT: s_min_u32 s2, s2, 1
+; GFX1064-NEXT: s_or_b32 s2, s3, s2
+; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX1064-NEXT: s_sub_i32 s2, 32, s4
+; GFX1064-NEXT: v_ldexp_f32 v0, v0, s2
; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s5, s[0:1], 0x0
+; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: v_mov_b32_e32 v1, s5
; GFX1064-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
@@ -12775,17 +12961,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b32 s3, 0
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1164-NEXT: s_mov_b64 s[6:7], exec
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB19_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
-; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX1164-NEXT: s_clz_i32_u32 s6, 0
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[0:1]
+; GFX1164-NEXT: s_min_u32 s6, s6, 32
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
+; GFX1164-NEXT: s_lshl_b64 s[0:1], s[2:3], s6
+; GFX1164-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1164-NEXT: s_min_u32 s0, s0, 1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_or_b32 s0, s1, s0
+; GFX1164-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX1164-NEXT: s_sub_i32 s0, 32, s6
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_ldexp_f32 v0, v0, s0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f32_e32 v0, 4.0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -12815,6 +13011,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX7LESS-DPP: ; %bb.0:
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0
; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -12822,15 +13019,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB19_3
; GFX7LESS-DPP-NEXT: ; %bb.1:
; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-DPP-NEXT: s_flbit_i32_b32 s2, 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX7LESS-DPP-NEXT: s_min_u32 s2, s2, 32
+; GFX7LESS-DPP-NEXT: s_lshl_b64 s[6:7], s[6:7], s2
+; GFX7LESS-DPP-NEXT: s_sub_i32 s2, 32, s2
+; GFX7LESS-DPP-NEXT: s_min_u32 s6, s6, 1
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s8, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_or_b32 s6, s7, s6
+; GFX7LESS-DPP-NEXT: v_cvt_f32_u32_e32 v0, s6
+; GFX7LESS-DPP-NEXT: v_ldexp_f32_e64 v0, v0, s2
; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s8
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -12850,23 +13054,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-DPP-NEXT: s_mov_b32 s3, 0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB19_3
; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX9-DPP-NEXT: s_flbit_i32_b32 s4, 0
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-DPP-NEXT: s_min_u32 s4, s4, 32
+; GFX9-DPP-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX9-DPP-NEXT: s_min_u32 s2, s2, 1
+; GFX9-DPP-NEXT: s_or_b32 s2, s3, s2
+; GFX9-DPP-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_sub_i32 s2, 32, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_ldexp_f32 v0, v0, s2
+; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s5
; GFX9-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
@@ -12882,23 +13094,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
;
; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], exec
+; GFX1064-DPP-NEXT: s_mov_b32 s3, 0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB19_3
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_flbit_i32_b32 s4, 0
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX1064-DPP-NEXT: s_min_u32 s4, s4, 32
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1064-DPP-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX1064-DPP-NEXT: s_min_u32 s2, s2, 1
+; GFX1064-DPP-NEXT: s_or_b32 s2, s3, s2
+; GFX1064-DPP-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX1064-DPP-NEXT: s_sub_i32 s2, 32, s4
+; GFX1064-DPP-NEXT: v_ldexp_f32 v0, v0, s2
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s5, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s5
; GFX1064-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
@@ -12946,17 +13166,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: s_mov_b32 s3, 0
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], exec
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB19_2
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
-; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX1164-DPP-NEXT: s_clz_i32_u32 s6, 0
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1]
+; GFX1164-DPP-NEXT: s_min_u32 s6, s6, 32
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
+; GFX1164-DPP-NEXT: s_lshl_b64 s[0:1], s[2:3], s6
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1164-DPP-NEXT: s_min_u32 s0, s0, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_or_b32 s0, s1, s0
+; GFX1164-DPP-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX1164-DPP-NEXT: s_sub_i32 s0, 32, s6
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_ldexp_f32 v0, v0, s0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f32_e32 v0, 4.0, v0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index bb119ebd11daa..b2102e977b185 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -19,6 +19,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT: s_mov_b32 s7, 0
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -26,15 +27,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT: s_flbit_i32_b32 s2, 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX7LESS-NEXT: s_min_u32 s2, s2, 32
+; GFX7LESS-NEXT: s_lshl_b64 s[6:7], s[6:7], s2
+; GFX7LESS-NEXT: s_sub_i32 s2, 32, s2
+; GFX7LESS-NEXT: s_min_u32 s6, s6, 1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0x0
+; GFX7LESS-NEXT: s_or_b32 s6, s7, s6
+; GFX7LESS-NEXT: v_cvt_f32_u32_e32 v0, s6
+; GFX7LESS-NEXT: v_ldexp_f32_e64 v0, v0, s2
; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s8
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -54,23 +62,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
;
; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_3
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX9-NEXT: s_flbit_i32_b32 s4, 0
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT: s_min_u32 s4, s4, 32
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX9-NEXT: s_min_u32 s2, s2, 1
+; GFX9-NEXT: s_or_b32 s2, s3, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0
+; GFX9-NEXT: s_sub_i32 s2, 32, s4
; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_ldexp_f32 v0, v0, s2
+; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2
@@ -86,23 +102,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
;
; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[6:7], exec
+; GFX1064-NEXT: s_mov_b32 s3, 0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_flbit_i32_b32 s4, 0
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX1064-NEXT: s_min_u32 s4, s4, 32
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1064-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX1064-NEXT: s_min_u32 s2, s2, 1
+; GFX1064-NEXT: s_or_b32 s2, s3, s2
+; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX1064-NEXT: s_sub_i32 s2, 32, s4
+; GFX1064-NEXT: v_ldexp_f32 v0, v0, s2
; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s5, s[0:1], 0x0
+; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: v_mov_b32_e32 v1, s5
; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
@@ -149,25 +173,36 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
;
; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164-NEXT: s_mov_b32 s3, 0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: s_clz_i32_u32 s4, 0
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX1164-NEXT: s_min_u32 s4, s4, 32
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1164-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_min_u32 s2, s2, 1
+; GFX1164-NEXT: s_or_b32 s2, s3, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX1164-NEXT: s_sub_i32 s2, 32, s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_ldexp_f32 v0, v0, s2
; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_load_b32 s5, s[0:1], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-NEXT: v_mov_b32_e32 v1, s5
; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -220,6 +255,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX7LESS-DPP: ; %bb.0:
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0
; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -227,15 +263,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX7LESS-DPP-NEXT: ; %bb.1:
; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-DPP-NEXT: s_flbit_i32_b32 s2, 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX7LESS-DPP-NEXT: s_min_u32 s2, s2, 32
+; GFX7LESS-DPP-NEXT: s_lshl_b64 s[6:7], s[6:7], s2
+; GFX7LESS-DPP-NEXT: s_sub_i32 s2, 32, s2
+; GFX7LESS-DPP-NEXT: s_min_u32 s6, s6, 1
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dword s8, s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: s_or_b32 s6, s7, s6
+; GFX7LESS-DPP-NEXT: v_cvt_f32_u32_e32 v0, s6
+; GFX7LESS-DPP-NEXT: v_ldexp_f32_e64 v0, v0, s2
; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s8
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -255,23 +298,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-DPP-NEXT: s_mov_b32 s3, 0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX9-DPP-NEXT: s_flbit_i32_b32 s4, 0
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-DPP-NEXT: s_min_u32 s4, s4, 32
+; GFX9-DPP-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX9-DPP-NEXT: s_min_u32 s2, s2, 1
+; GFX9-DPP-NEXT: s_or_b32 s2, s3, s2
+; GFX9-DPP-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_sub_i32 s2, 32, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: v_ldexp_f32 v0, v0, s2
+; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s5
; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
@@ -287,23 +338,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
;
; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], exec
+; GFX1064-DPP-NEXT: s_mov_b32 s3, 0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_flbit_i32_b32 s4, 0
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX1064-DPP-NEXT: s_min_u32 s4, s4, 32
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1064-DPP-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX1064-DPP-NEXT: s_min_u32 s2, s2, 1
+; GFX1064-DPP-NEXT: s_or_b32 s2, s3, s2
+; GFX1064-DPP-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX1064-DPP-NEXT: s_sub_i32 s2, 32, s4
+; GFX1064-DPP-NEXT: v_ldexp_f32 v0, v0, s2
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s5, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s5
; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
@@ -350,25 +409,36 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
;
; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164-DPP-NEXT: s_mov_b32 s3, 0
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-DPP-NEXT: s_clz_i32_u32 s4, 0
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX1164-DPP-NEXT: s_min_u32 s4, s4, 32
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1164-DPP-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_min_u32 s2, s2, 1
+; GFX1164-DPP-NEXT: s_or_b32 s2, s3, s2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX1164-DPP-NEXT: s_sub_i32 s2, 32, s4
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_ldexp_f32 v0, v0, s2
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_load_b32 s5, s[0:1], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s5
; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -6073,11 +6143,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32
+; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
+; GFX7LESS-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; GFX7LESS-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6
@@ -6104,24 +6177,27 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
;
; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB9_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], 0
+; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT: v_cvt_f64_u32_e32 v[2:3], s0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
@@ -6138,24 +6214,27 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
;
; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], 0
+; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1064-NEXT: v_cvt_f64_u32_e32 v[2:3], s0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
+; GFX1064-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mov_b32_e32 v2, s2
; GFX1064-NEXT: v_mov_b32_e32 v3, s3
; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
@@ -6213,18 +6292,22 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], 0
; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1164-NEXT: v_cvt_f64_u32_e32 v[2:3], s0
; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32
+; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_mov_b32_e32 v2, s2
; GFX1164-NEXT: v_mov_b32_e32 v3, s3
; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -6287,11 +6370,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX7LESS-DPP-NEXT: ; %bb.1:
; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32
+; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
+; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6
@@ -6318,24 +6404,27 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], 0
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[2:3], s0
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX9-DPP-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
@@ -6352,24 +6441,27 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
;
; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], 0
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[2:3], s0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064-DPP-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
@@ -6427,18 +6519,22 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], 0
; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[2:3], s0
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index a42c71c4849bd..d2810633cafc1 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -8324,27 +8324,35 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX942-LABEL: local_ds_fadd:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
-; GFX942-NEXT: s_mov_b64 s[0:1], exec
-; GFX942-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX942-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT: s_mov_b64 s[6:7], exec
+; GFX942-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX942-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX942-NEXT: s_mov_b32 s9, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_add_i32 s3, s3, 4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT: s_flbit_i32_b32 s10, 0
; GFX942-NEXT: ; implicit-def: $vgpr1
-; GFX942-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB28_2
; GFX942-NEXT: ; %bb.1:
-; GFX942-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX942-NEXT: s_lshl_b32 s8, s3, 3
-; GFX942-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
+; GFX942-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX942-NEXT: s_min_u32 s11, s10, 32
+; GFX942-NEXT: s_lshl_b64 s[6:7], s[8:9], s11
+; GFX942-NEXT: s_min_u32 s6, s6, 1
+; GFX942-NEXT: s_or_b32 s6, s7, s6
+; GFX942-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX942-NEXT: s_sub_i32 s7, 32, s11
+; GFX942-NEXT: s_lshl_b32 s6, s3, 3
+; GFX942-NEXT: v_mov_b32_e32 v2, s6
+; GFX942-NEXT: v_ldexp_f32 v1, v1, s7
; GFX942-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: ds_add_rtn_f32 v1, v2, v1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: .LBB28_2:
-; GFX942-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_mov_b64 s[8:9], exec
-; GFX942-NEXT: v_readfirstlane_b32 s10, v1
+; GFX942-NEXT: v_readfirstlane_b32 s11, v1
; GFX942-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0
; GFX942-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1
; GFX942-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1
@@ -8352,7 +8360,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX942-NEXT: s_cbranch_execz .LBB28_4
; GFX942-NEXT: ; %bb.3:
; GFX942-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
-; GFX942-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
+; GFX942-NEXT: s_mov_b32 s1, 0
+; GFX942-NEXT: s_min_u32 s8, s10, 32
+; GFX942-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
+; GFX942-NEXT: s_min_u32 s0, s0, 1
+; GFX942-NEXT: s_or_b32 s0, s1, s0
+; GFX942-NEXT: v_cvt_f32_u32_e32 v1, s0
+; GFX942-NEXT: s_sub_i32 s0, 32, s8
+; GFX942-NEXT: v_ldexp_f32 v1, v1, s0
; GFX942-NEXT: s_lshl_b32 s0, s3, 4
; GFX942-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
; GFX942-NEXT: v_mov_b32_e32 v2, s0
@@ -8362,8 +8377,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX942-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX942-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX942-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
-; GFX942-NEXT: v_add_f32_e32 v0, s10, v0
-; GFX942-NEXT: v_mov_b32_e32 v1, s10
+; GFX942-NEXT: v_add_f32_e32 v0, s11, v0
+; GFX942-NEXT: v_mov_b32_e32 v1, s11
; GFX942-NEXT: s_mov_b64 s[0:1], exec
; GFX942-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX942-NEXT: v_bfrev_b32_e32 v1, 1
@@ -8582,27 +8597,35 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX90A-LABEL: local_ds_fadd:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
-; GFX90A-NEXT: s_mov_b64 s[0:1], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_mov_b64 s[6:7], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX90A-NEXT: s_mov_b32 s9, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_add_i32 s3, s3, 4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_flbit_i32_b32 s10, 0
; GFX90A-NEXT: ; implicit-def: $vgpr1
-; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB28_2
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX90A-NEXT: s_lshl_b32 s8, s3, 3
-; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
+; GFX90A-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX90A-NEXT: s_min_u32 s11, s10, 32
+; GFX90A-NEXT: s_lshl_b64 s[6:7], s[8:9], s11
+; GFX90A-NEXT: s_min_u32 s6, s6, 1
+; GFX90A-NEXT: s_or_b32 s6, s7, s6
+; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX90A-NEXT: s_sub_i32 s7, 32, s11
+; GFX90A-NEXT: s_lshl_b32 s6, s3, 3
+; GFX90A-NEXT: v_mov_b32_e32 v2, s6
+; GFX90A-NEXT: v_ldexp_f32 v1, v1, s7
; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
; GFX90A-NEXT: ds_add_rtn_f32 v1, v2, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: .LBB28_2:
-; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_mov_b64 s[8:9], exec
-; GFX90A-NEXT: v_readfirstlane_b32 s10, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s11, v1
; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1
; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1
@@ -8610,7 +8633,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX90A-NEXT: s_cbranch_execz .LBB28_4
; GFX90A-NEXT: ; %bb.3:
; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
-; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
+; GFX90A-NEXT: s_mov_b32 s1, 0
+; GFX90A-NEXT: s_min_u32 s8, s10, 32
+; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
+; GFX90A-NEXT: s_min_u32 s0, s0, 1
+; GFX90A-NEXT: s_or_b32 s0, s1, s0
+; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s0
+; GFX90A-NEXT: s_sub_i32 s0, 32, s8
+; GFX90A-NEXT: v_ldexp_f32 v1, v1, s0
; GFX90A-NEXT: s_lshl_b32 s0, s3, 4
; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
@@ -8620,8 +8650,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX90A-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
-; GFX90A-NEXT: v_add_f32_e32 v0, s10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s10
+; GFX90A-NEXT: v_add_f32_e32 v0, s11, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s11
; GFX90A-NEXT: s_mov_b64 s[0:1], exec
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX90A-NEXT: v_bfrev_b32_e32 v1, 1
@@ -8665,27 +8695,35 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX908-LABEL: local_ds_fadd:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
-; GFX908-NEXT: s_mov_b64 s[0:1], exec
-; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT: s_mov_b64 s[6:7], exec
+; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX908-NEXT: s_mov_b32 s9, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_add_i32 s3, s3, 4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT: s_flbit_i32_b32 s10, 0
; GFX908-NEXT: ; implicit-def: $vgpr1
-; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX908-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX908-NEXT: s_cbranch_execz .LBB28_2
; GFX908-NEXT: ; %bb.1:
-; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX908-NEXT: s_lshl_b32 s8, s3, 3
-; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
+; GFX908-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX908-NEXT: s_min_u32 s11, s10, 32
+; GFX908-NEXT: s_lshl_b64 s[6:7], s[8:9], s11
+; GFX908-NEXT: s_min_u32 s6, s6, 1
+; GFX908-NEXT: s_or_b32 s6, s7, s6
+; GFX908-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX908-NEXT: s_sub_i32 s7, 32, s11
+; GFX908-NEXT: s_lshl_b32 s6, s3, 3
+; GFX908-NEXT: v_mov_b32_e32 v2, s6
+; GFX908-NEXT: v_ldexp_f32 v1, v1, s7
; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
-; GFX908-NEXT: v_mov_b32_e32 v2, s8
; GFX908-NEXT: ds_add_rtn_f32 v1, v2, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: .LBB28_2:
-; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX908-NEXT: s_mov_b64 s[8:9], exec
-; GFX908-NEXT: v_readfirstlane_b32 s10, v1
+; GFX908-NEXT: v_readfirstlane_b32 s11, v1
; GFX908-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0
; GFX908-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1
; GFX908-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1
@@ -8693,7 +8731,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX908-NEXT: s_cbranch_execz .LBB28_4
; GFX908-NEXT: ; %bb.3:
; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
-; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
+; GFX908-NEXT: s_mov_b32 s1, 0
+; GFX908-NEXT: s_min_u32 s8, s10, 32
+; GFX908-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
+; GFX908-NEXT: s_min_u32 s0, s0, 1
+; GFX908-NEXT: s_or_b32 s0, s1, s0
+; GFX908-NEXT: v_cvt_f32_u32_e32 v1, s0
+; GFX908-NEXT: s_sub_i32 s0, 32, s8
+; GFX908-NEXT: v_ldexp_f32 v1, v1, s0
; GFX908-NEXT: s_lshl_b32 s0, s3, 4
; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
; GFX908-NEXT: v_mov_b32_e32 v2, s0
@@ -8703,8 +8748,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX908-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
-; GFX908-NEXT: v_add_f32_e32 v0, s10, v0
-; GFX908-NEXT: v_mov_b32_e32 v1, s10
+; GFX908-NEXT: v_add_f32_e32 v0, s11, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, s11
; GFX908-NEXT: s_mov_b64 s[0:1], exec
; GFX908-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX908-NEXT: v_bfrev_b32_e32 v1, 1
@@ -8748,28 +8793,36 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX8-LABEL: local_ds_fadd:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
-; GFX8-NEXT: s_mov_b64 s[0:1], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX8-NEXT: s_mov_b32 s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_i32 s3, s3, 4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: s_flbit_i32_b32 s10, 0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB28_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX8-NEXT: s_lshl_b32 s8, s3, 3
-; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
+; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX8-NEXT: s_min_u32 s11, s10, 32
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[8:9], s11
+; GFX8-NEXT: s_min_u32 s6, s6, 1
+; GFX8-NEXT: s_or_b32 s6, s7, s6
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX8-NEXT: s_sub_i32 s7, 32, s11
+; GFX8-NEXT: s_lshl_b32 s6, s3, 3
+; GFX8-NEXT: v_mov_b32_e32 v2, s6
+; GFX8-NEXT: v_ldexp_f32 v1, v1, s7
; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, s8
; GFX8-NEXT: ds_add_rtn_f32 v1, v2, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB28_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_mov_b64 s[8:9], exec
-; GFX8-NEXT: v_readfirstlane_b32 s10, v1
+; GFX8-NEXT: v_readfirstlane_b32 s11, v1
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1
@@ -8777,7 +8830,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX8-NEXT: s_cbranch_execz .LBB28_4
; GFX8-NEXT: ; %bb.3:
; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
-; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
+; GFX8-NEXT: s_mov_b32 s1, 0
+; GFX8-NEXT: s_min_u32 s8, s10, 32
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s8
+; GFX8-NEXT: v_ldexp_f32 v1, v1, s0
; GFX8-NEXT: s_lshl_b32 s0, s3, 4
; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
@@ -8787,8 +8847,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX8-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
-; GFX8-NEXT: v_add_f32_e32 v0, s10, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, s10
+; GFX8-NEXT: v_add_f32_e32 v0, s11, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: s_mov_b64 s[0:1], exec
; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX8-NEXT: v_bfrev_b32_e32 v1, 1
@@ -8837,19 +8897,27 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX7-NEXT: s_mov_b64 s[0:1], exec
; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0
; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT: s_mov_b32 s9, 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_i32 s3, s3, 4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT: s_flbit_i32_b32 s12, 0
; GFX7-NEXT: ; implicit-def: $vgpr1
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX7-NEXT: s_cbranch_execz .LBB28_4
; GFX7-NEXT: ; %bb.1:
-; GFX7-NEXT: s_lshl_b32 s8, s3, 3
-; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: s_bcnt1_i32_b64 s8, s[0:1]
+; GFX7-NEXT: s_min_u32 s11, s12, 32
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[8:9], s11
+; GFX7-NEXT: s_min_u32 s0, s0, 1
+; GFX7-NEXT: s_lshl_b32 s10, s3, 3
+; GFX7-NEXT: s_or_b32 s0, s1, s0
+; GFX7-NEXT: v_cvt_f32_u32_e32 v3, s0
+; GFX7-NEXT: v_mov_b32_e32 v2, s10
; GFX7-NEXT: ds_read_b32 v1, v2
-; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0
+; GFX7-NEXT: s_sub_i32 s0, 32, s11
+; GFX7-NEXT: v_ldexp_f32_e64 v3, v3, s0
; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB28_2: ; %atomicrmw.start
@@ -8867,19 +8935,26 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: .LBB28_4: ; %Flow23
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: s_mov_b64 s[8:9], exec
-; GFX7-NEXT: v_readfirstlane_b32 s10, v1
-; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0
-; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1
+; GFX7-NEXT: s_mov_b64 s[10:11], exec
+; GFX7-NEXT: v_readfirstlane_b32 s13, v1
+; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s10, 0
+; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s11, v1
+; GFX7-NEXT: s_mov_b32 s9, 0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1
; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1]
; GFX7-NEXT: s_cbranch_execz .LBB28_7
; GFX7-NEXT: ; %bb.5:
+; GFX7-NEXT: s_bcnt1_i32_b64 s8, s[10:11]
+; GFX7-NEXT: s_min_u32 s10, s12, 32
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[8:9], s10
+; GFX7-NEXT: s_min_u32 s0, s0, 1
+; GFX7-NEXT: s_or_b32 s0, s1, s0
+; GFX7-NEXT: v_cvt_f32_u32_e32 v2, s0
; GFX7-NEXT: s_lshl_b32 s0, s3, 4
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: ds_read_b32 v3, v1
-; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
-; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0
+; GFX7-NEXT: s_sub_i32 s0, 32, s10
+; GFX7-NEXT: v_ldexp_f32_e64 v2, v2, s0
; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB28_6: ; %atomicrmw.start2
@@ -8897,8 +8972,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
-; GFX7-NEXT: v_add_f32_e32 v0, s10, v0
-; GFX7-NEXT: v_mov_b32_e32 v1, s10
+; GFX7-NEXT: v_add_f32_e32 v0, s13, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s13
; GFX7-NEXT: s_mov_b64 s[0:1], exec
; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX7-NEXT: v_bfrev_b32_e32 v1, 1
@@ -8961,19 +9036,27 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX6-NEXT: s_mov_b64 s[0:1], exec
; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0
; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX6-NEXT: s_mov_b32 s9, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_add_i32 s3, s3, 4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX6-NEXT: s_flbit_i32_b32 s12, 0
; GFX6-NEXT: ; implicit-def: $vgpr1
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX6-NEXT: s_cbranch_execz .LBB28_4
; GFX6-NEXT: ; %bb.1:
-; GFX6-NEXT: s_lshl_b32 s8, s3, 3
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: s_bcnt1_i32_b64 s8, s[0:1]
+; GFX6-NEXT: s_min_u32 s11, s12, 32
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[8:9], s11
+; GFX6-NEXT: s_min_u32 s0, s0, 1
+; GFX6-NEXT: s_lshl_b32 s10, s3, 3
+; GFX6-NEXT: s_or_b32 s0, s1, s0
+; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s0
+; GFX6-NEXT: v_mov_b32_e32 v2, s10
; GFX6-NEXT: ds_read_b32 v1, v2
-; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0
+; GFX6-NEXT: s_sub_i32 s0, 32, s11
+; GFX6-NEXT: v_ldexp_f32_e64 v3, v3, s0
; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB28_2: ; %atomicrmw.start
@@ -8991,19 +9074,26 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: .LBB28_4: ; %Flow21
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX6-NEXT: s_mov_b64 s[8:9], exec
-; GFX6-NEXT: v_readfirstlane_b32 s10, v1
-; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0
-; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1
+; GFX6-NEXT: s_mov_b64 s[10:11], exec
+; GFX6-NEXT: v_readfirstlane_b32 s13, v1
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s10, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s11, v1
+; GFX6-NEXT: s_mov_b32 s9, 0
; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1
; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1]
; GFX6-NEXT: s_cbranch_execz .LBB28_7
; GFX6-NEXT: ; %bb.5:
+; GFX6-NEXT: s_bcnt1_i32_b64 s8, s[10:11]
+; GFX6-NEXT: s_min_u32 s10, s12, 32
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[8:9], s10
+; GFX6-NEXT: s_min_u32 s0, s0, 1
+; GFX6-NEXT: s_or_b32 s0, s1, s0
+; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s0
; GFX6-NEXT: s_lshl_b32 s0, s3, 4
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: ds_read_b32 v3, v1
-; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
-; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0
+; GFX6-NEXT: s_sub_i32 s0, 32, s10
+; GFX6-NEXT: v_ldexp_f32_e64 v2, v2, s0
; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB28_6: ; %atomicrmw.start2
@@ -9021,8 +9111,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
-; GFX6-NEXT: v_add_f32_e32 v0, s10, v0
-; GFX6-NEXT: v_mov_b32_e32 v1, s10
+; GFX6-NEXT: v_add_f32_e32 v0, s13, v0
+; GFX6-NEXT: v_mov_b32_e32 v1, s13
; GFX6-NEXT: s_mov_b64 s[0:1], exec
; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX6-NEXT: v_bfrev_b32_e32 v1, 1
@@ -9186,27 +9276,35 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX942-LABEL: local_ds_fadd_one_as:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
-; GFX942-NEXT: s_mov_b64 s[0:1], exec
-; GFX942-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX942-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT: s_mov_b64 s[6:7], exec
+; GFX942-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX942-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX942-NEXT: s_mov_b32 s9, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_add_i32 s3, s3, 4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT: s_flbit_i32_b32 s10, 0
; GFX942-NEXT: ; implicit-def: $vgpr1
-; GFX942-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB29_2
; GFX942-NEXT: ; %bb.1:
-; GFX942-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX942-NEXT: s_lshl_b32 s8, s3, 3
-; GFX942-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
+; GFX942-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX942-NEXT: s_min_u32 s11, s10, 32
+; GFX942-NEXT: s_lshl_b64 s[6:7], s[8:9], s11
+; GFX942-NEXT: s_min_u32 s6, s6, 1
+; GFX942-NEXT: s_or_b32 s6, s7, s6
+; GFX942-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX942-NEXT: s_sub_i32 s7, 32, s11
+; GFX942-NEXT: s_lshl_b32 s6, s3, 3
+; GFX942-NEXT: v_mov_b32_e32 v2, s6
+; GFX942-NEXT: v_ldexp_f32 v1, v1, s7
; GFX942-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: ds_add_rtn_f32 v1, v2, v1
; GFX942-NEXT: .LBB29_2:
-; GFX942-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_mov_b64 s[8:9], exec
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_readfirstlane_b32 s10, v1
+; GFX942-NEXT: v_readfirstlane_b32 s11, v1
; GFX942-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0
; GFX942-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1
; GFX942-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1
@@ -9214,7 +9312,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX942-NEXT: s_cbranch_execz .LBB29_4
; GFX942-NEXT: ; %bb.3:
; GFX942-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
-; GFX942-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
+; GFX942-NEXT: s_mov_b32 s1, 0
+; GFX942-NEXT: s_min_u32 s8, s10, 32
+; GFX942-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
+; GFX942-NEXT: s_min_u32 s0, s0, 1
+; GFX942-NEXT: s_or_b32 s0, s1, s0
+; GFX942-NEXT: v_cvt_f32_u32_e32 v1, s0
+; GFX942-NEXT: s_sub_i32 s0, 32, s8
+; GFX942-NEXT: v_ldexp_f32 v1, v1, s0
; GFX942-NEXT: s_lshl_b32 s0, s3, 4
; GFX942-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
; GFX942-NEXT: v_mov_b32_e32 v2, s0
@@ -9223,8 +9328,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX942-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX942-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX942-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
-; GFX942-NEXT: v_add_f32_e32 v0, s10, v0
-; GFX942-NEXT: v_mov_b32_e32 v1, s10
+; GFX942-NEXT: v_add_f32_e32 v0, s11, v0
+; GFX942-NEXT: v_mov_b32_e32 v1, s11
; GFX942-NEXT: s_mov_b64 s[0:1], exec
; GFX942-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX942-NEXT: v_bfrev_b32_e32 v1, 1
@@ -9427,27 +9532,35 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX90A-LABEL: local_ds_fadd_one_as:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
-; GFX90A-NEXT: s_mov_b64 s[0:1], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_mov_b64 s[6:7], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX90A-NEXT: s_mov_b32 s9, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_add_i32 s3, s3, 4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_flbit_i32_b32 s10, 0
; GFX90A-NEXT: ; implicit-def: $vgpr1
-; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB29_2
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX90A-NEXT: s_lshl_b32 s8, s3, 3
-; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
+; GFX90A-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX90A-NEXT: s_min_u32 s11, s10, 32
+; GFX90A-NEXT: s_lshl_b64 s[6:7], s[8:9], s11
+; GFX90A-NEXT: s_min_u32 s6, s6, 1
+; GFX90A-NEXT: s_or_b32 s6, s7, s6
+; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX90A-NEXT: s_sub_i32 s7, 32, s11
+; GFX90A-NEXT: s_lshl_b32 s6, s3, 3
+; GFX90A-NEXT: v_mov_b32_e32 v2, s6
+; GFX90A-NEXT: v_ldexp_f32 v1, v1, s7
; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
; GFX90A-NEXT: ds_add_rtn_f32 v1, v2, v1
; GFX90A-NEXT: .LBB29_2:
-; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_mov_b64 s[8:9], exec
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_readfirstlane_b32 s10, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s11, v1
; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1
; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1
@@ -9455,7 +9568,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX90A-NEXT: s_cbranch_execz .LBB29_4
; GFX90A-NEXT: ; %bb.3:
; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
-; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
+; GFX90A-NEXT: s_mov_b32 s1, 0
+; GFX90A-NEXT: s_min_u32 s8, s10, 32
+; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
+; GFX90A-NEXT: s_min_u32 s0, s0, 1
+; GFX90A-NEXT: s_or_b32 s0, s1, s0
+; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s0
+; GFX90A-NEXT: s_sub_i32 s0, 32, s8
+; GFX90A-NEXT: v_ldexp_f32 v1, v1, s0
; GFX90A-NEXT: s_lshl_b32 s0, s3, 4
; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
@@ -9464,8 +9584,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX90A-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
-; GFX90A-NEXT: v_add_f32_e32 v0, s10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s10
+; GFX90A-NEXT: v_add_f32_e32 v0, s11, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s11
; GFX90A-NEXT: s_mov_b64 s[0:1], exec
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX90A-NEXT: v_bfrev_b32_e32 v1, 1
@@ -9507,27 +9627,35 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX908-LABEL: local_ds_fadd_one_as:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
-; GFX908-NEXT: s_mov_b64 s[0:1], exec
-; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT: s_mov_b64 s[6:7], exec
+; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX908-NEXT: s_mov_b32 s9, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_add_i32 s3, s3, 4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT: s_flbit_i32_b32 s10, 0
; GFX908-NEXT: ; implicit-def: $vgpr1
-; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX908-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX908-NEXT: s_cbranch_execz .LBB29_2
; GFX908-NEXT: ; %bb.1:
-; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX908-NEXT: s_lshl_b32 s8, s3, 3
-; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
+; GFX908-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX908-NEXT: s_min_u32 s11, s10, 32
+; GFX908-NEXT: s_lshl_b64 s[6:7], s[8:9], s11
+; GFX908-NEXT: s_min_u32 s6, s6, 1
+; GFX908-NEXT: s_or_b32 s6, s7, s6
+; GFX908-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX908-NEXT: s_sub_i32 s7, 32, s11
+; GFX908-NEXT: s_lshl_b32 s6, s3, 3
+; GFX908-NEXT: v_mov_b32_e32 v2, s6
+; GFX908-NEXT: v_ldexp_f32 v1, v1, s7
; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
-; GFX908-NEXT: v_mov_b32_e32 v2, s8
; GFX908-NEXT: ds_add_rtn_f32 v1, v2, v1
; GFX908-NEXT: .LBB29_2:
-; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX908-NEXT: s_mov_b64 s[8:9], exec
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_readfirstlane_b32 s10, v1
+; GFX908-NEXT: v_readfirstlane_b32 s11, v1
; GFX908-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0
; GFX908-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1
; GFX908-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1
@@ -9535,7 +9663,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX908-NEXT: s_cbranch_execz .LBB29_4
; GFX908-NEXT: ; %bb.3:
; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
-; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
+; GFX908-NEXT: s_mov_b32 s1, 0
+; GFX908-NEXT: s_min_u32 s8, s10, 32
+; GFX908-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
+; GFX908-NEXT: s_min_u32 s0, s0, 1
+; GFX908-NEXT: s_or_b32 s0, s1, s0
+; GFX908-NEXT: v_cvt_f32_u32_e32 v1, s0
+; GFX908-NEXT: s_sub_i32 s0, 32, s8
+; GFX908-NEXT: v_ldexp_f32 v1, v1, s0
; GFX908-NEXT: s_lshl_b32 s0, s3, 4
; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
; GFX908-NEXT: v_mov_b32_e32 v2, s0
@@ -9544,8 +9679,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX908-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
-; GFX908-NEXT: v_add_f32_e32 v0, s10, v0
-; GFX908-NEXT: v_mov_b32_e32 v1, s10
+; GFX908-NEXT: v_add_f32_e32 v0, s11, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, s11
; GFX908-NEXT: s_mov_b64 s[0:1], exec
; GFX908-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX908-NEXT: v_bfrev_b32_e32 v1, 1
@@ -9587,28 +9722,36 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX8-LABEL: local_ds_fadd_one_as:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
-; GFX8-NEXT: s_mov_b64 s[0:1], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX8-NEXT: s_mov_b32 s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_i32 s3, s3, 4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: s_flbit_i32_b32 s10, 0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB29_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX8-NEXT: s_lshl_b32 s8, s3, 3
-; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
+; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX8-NEXT: s_min_u32 s11, s10, 32
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[8:9], s11
+; GFX8-NEXT: s_min_u32 s6, s6, 1
+; GFX8-NEXT: s_or_b32 s6, s7, s6
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX8-NEXT: s_sub_i32 s7, 32, s11
+; GFX8-NEXT: s_lshl_b32 s6, s3, 3
+; GFX8-NEXT: v_mov_b32_e32 v2, s6
+; GFX8-NEXT: v_ldexp_f32 v1, v1, s7
; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, s8
; GFX8-NEXT: ds_add_rtn_f32 v1, v2, v1
; GFX8-NEXT: .LBB29_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_mov_b64 s[8:9], exec
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s10, v1
+; GFX8-NEXT: v_readfirstlane_b32 s11, v1
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1
@@ -9616,7 +9759,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: s_cbranch_execz .LBB29_4
; GFX8-NEXT: ; %bb.3:
; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
-; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
+; GFX8-NEXT: s_mov_b32 s1, 0
+; GFX8-NEXT: s_min_u32 s8, s10, 32
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s8
+; GFX8-NEXT: v_ldexp_f32 v1, v1, s0
; GFX8-NEXT: s_lshl_b32 s0, s3, 4
; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
@@ -9625,8 +9775,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX8-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
-; GFX8-NEXT: v_add_f32_e32 v0, s10, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, s10
+; GFX8-NEXT: v_add_f32_e32 v0, s11, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: s_mov_b64 s[0:1], exec
; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX8-NEXT: v_bfrev_b32_e32 v1, 1
@@ -9673,19 +9823,27 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX7-NEXT: s_mov_b64 s[0:1], exec
; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0
; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT: s_mov_b32 s9, 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_i32 s3, s3, 4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT: s_flbit_i32_b32 s12, 0
; GFX7-NEXT: ; implicit-def: $vgpr1
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX7-NEXT: s_cbranch_execz .LBB29_4
; GFX7-NEXT: ; %bb.1:
-; GFX7-NEXT: s_lshl_b32 s8, s3, 3
-; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: s_bcnt1_i32_b64 s8, s[0:1]
+; GFX7-NEXT: s_min_u32 s11, s12, 32
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[8:9], s11
+; GFX7-NEXT: s_min_u32 s0, s0, 1
+; GFX7-NEXT: s_lshl_b32 s10, s3, 3
+; GFX7-NEXT: s_or_b32 s0, s1, s0
+; GFX7-NEXT: v_cvt_f32_u32_e32 v3, s0
+; GFX7-NEXT: v_mov_b32_e32 v2, s10
; GFX7-NEXT: ds_read_b32 v1, v2
-; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0
+; GFX7-NEXT: s_sub_i32 s0, 32, s11
+; GFX7-NEXT: v_ldexp_f32_e64 v3, v3, s0
; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB29_2: ; %atomicrmw.start
@@ -9703,19 +9861,26 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: .LBB29_4: ; %Flow23
; GFX7-NEXT: s_or_b64 exec, exec,
s[6:7] -; GFX7-NEXT: s_mov_b64 s[8:9], exec -; GFX7-NEXT: v_readfirstlane_b32 s10, v1 -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0 -; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1 +; GFX7-NEXT: s_mov_b64 s[10:11], exec +; GFX7-NEXT: v_readfirstlane_b32 s13, v1 +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s10, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s11, v1 +; GFX7-NEXT: s_mov_b32 s9, 0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB29_7 ; GFX7-NEXT: ; %bb.5: +; GFX7-NEXT: s_bcnt1_i32_b64 s8, s[10:11] +; GFX7-NEXT: s_min_u32 s10, s12, 32 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[8:9], s10 +; GFX7-NEXT: s_min_u32 s0, s0, 1 +; GFX7-NEXT: s_or_b32 s0, s1, s0 +; GFX7-NEXT: v_cvt_f32_u32_e32 v2, s0 ; GFX7-NEXT: s_lshl_b32 s0, s3, 4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: ds_read_b32 v3, v1 -; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX7-NEXT: s_sub_i32 s0, 32, s10 +; GFX7-NEXT: v_ldexp_f32_e64 v2, v2, s0 ; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_6: ; %atomicrmw.start2 @@ -9733,8 +9898,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 -; GFX7-NEXT: v_add_f32_e32 v0, s10, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s10 +; GFX7-NEXT: v_add_f32_e32 v0, s13, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-NEXT: s_mov_b64 s[0:1], exec ; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX7-NEXT: v_bfrev_b32_e32 v1, 1 @@ -9797,19 +9962,27 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_mov_b32 s9, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s3, s3, 4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_flbit_i32_b32 s12, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_cbranch_execz .LBB29_4 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_lshl_b32 s8, s3, 3 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: s_bcnt1_i32_b64 s8, s[0:1] +; GFX6-NEXT: s_min_u32 s11, s12, 32 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[8:9], s11 +; GFX6-NEXT: s_min_u32 s0, s0, 1 +; GFX6-NEXT: s_lshl_b32 s10, s3, 3 +; GFX6-NEXT: s_or_b32 s0, s1, s0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s0 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 ; GFX6-NEXT: ds_read_b32 v1, v2 -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX6-NEXT: s_sub_i32 s0, 32, s11 +; GFX6-NEXT: v_ldexp_f32_e64 v3, v3, s0 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_2: ; %atomicrmw.start @@ -9827,19 +10000,26 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: .LBB29_4: ; %Flow21 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[8:9], exec -; GFX6-NEXT: v_readfirstlane_b32 s10, v1 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1 +; GFX6-NEXT: s_mov_b64 s[10:11], exec +; GFX6-NEXT: v_readfirstlane_b32 s13, v1 +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s10, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s11, v1 +; GFX6-NEXT: 
s_mov_b32 s9, 0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB29_7 ; GFX6-NEXT: ; %bb.5: +; GFX6-NEXT: s_bcnt1_i32_b64 s8, s[10:11] +; GFX6-NEXT: s_min_u32 s10, s12, 32 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[8:9], s10 +; GFX6-NEXT: s_min_u32 s0, s0, 1 +; GFX6-NEXT: s_or_b32 s0, s1, s0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s0 ; GFX6-NEXT: s_lshl_b32 s0, s3, 4 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_read_b32 v3, v1 -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX6-NEXT: s_sub_i32 s0, 32, s10 +; GFX6-NEXT: v_ldexp_f32_e64 v2, v2, s0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_6: ; %atomicrmw.start2 @@ -9857,8 +10037,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 -; GFX6-NEXT: v_add_f32_e32 v0, s10, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, s10 +; GFX6-NEXT: v_add_f32_e32 v0, s13, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s13 ; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index df496258a2509..e268eedea6c36 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -459,64 +459,110 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s2, s[4:5], 0xe -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s0, s2, 8 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_ashr_i32 s1, s3, 8 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_xor_b32 s0, s1, s0 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-NEXT: s_or_b32 s2, s0, 1 +; GCN-NEXT: s_ashr_i32 s5, s11, 31 +; GCN-NEXT: s_ashr_i32 s4, s11, 8 +; GCN-NEXT: s_mov_b32 s1, s9 +; GCN-NEXT: s_xor_b32 s9, s4, s5 +; GCN-NEXT: s_ashr_i32 s9, s9, 31 +; GCN-NEXT: s_flbit_i32 s10, s5 +; GCN-NEXT: s_ashr_i32 s7, s6, 31 +; GCN-NEXT: s_ashr_i32 s6, s6, 8 +; GCN-NEXT: s_add_i32 s9, s9, 32 +; GCN-NEXT: s_add_i32 s10, s10, -1 +; GCN-NEXT: s_min_u32 s9, s10, s9 +; GCN-NEXT: s_xor_b32 s10, s6, s7 +; GCN-NEXT: s_ashr_i32 s10, s10, 31 +; GCN-NEXT: s_flbit_i32 s11, s7 +; GCN-NEXT: s_add_i32 s10, s10, 32 +; GCN-NEXT: s_add_i32 s11, s11, -1 +; GCN-NEXT: s_min_u32 s10, s11, s10 +; GCN-NEXT: s_mov_b32 s0, s8 +; GCN-NEXT: s_xor_b32 s8, s4, s6 +; GCN-NEXT: s_lshl_b64 s[6:7], s[6:7], s10 +; GCN-NEXT: s_min_u32 s6, s6, 1 +; GCN-NEXT: s_or_b32 s6, s7, s6 +; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GCN-NEXT: s_min_u32 s4, s4, 1 +; GCN-NEXT: s_or_b32 s4, s5, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: s_sub_i32 s4, 32, s10 +; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s4 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-NEXT: s_sub_i32 s4, 32, s9 +; GCN-NEXT: v_ldexp_f32_e64 v1, v1, s4 +; 
GCN-NEXT: s_ashr_i32 s8, s8, 30 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s0, s2, 0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GCN-NEXT: s_or_b32 s6, s8, 1 +; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_load_dword s2, s[4:5], 0xe -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GCN-IR-NEXT: s_load_dword s6, s[4:5], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i32 s0, s2, 8 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: s_ashr_i32 s1, s3, 8 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_xor_b32 s0, s1, s0 -; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-IR-NEXT: s_or_b32 s2, s0, 1 +; GCN-IR-NEXT: s_ashr_i32 s5, s11, 31 +; GCN-IR-NEXT: s_ashr_i32 s4, s11, 8 +; GCN-IR-NEXT: s_mov_b32 s1, s9 +; GCN-IR-NEXT: s_xor_b32 s9, s4, s5 +; GCN-IR-NEXT: s_ashr_i32 s9, s9, 31 +; GCN-IR-NEXT: s_flbit_i32 s10, s5 +; GCN-IR-NEXT: s_ashr_i32 s7, s6, 31 +; GCN-IR-NEXT: s_ashr_i32 s6, s6, 8 +; GCN-IR-NEXT: s_add_i32 s9, s9, 32 +; GCN-IR-NEXT: s_add_i32 s10, s10, -1 +; GCN-IR-NEXT: s_min_u32 s9, s10, s9 +; GCN-IR-NEXT: s_xor_b32 s10, s6, s7 +; GCN-IR-NEXT: s_ashr_i32 s10, s10, 31 +; GCN-IR-NEXT: s_flbit_i32 s11, s7 +; GCN-IR-NEXT: s_add_i32 s10, s10, 32 +; GCN-IR-NEXT: s_add_i32 s11, s11, -1 +; GCN-IR-NEXT: s_min_u32 s10, s11, s10 +; GCN-IR-NEXT: s_mov_b32 s0, s8 +; GCN-IR-NEXT: s_xor_b32 s8, s4, s6 +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], s10 +; GCN-IR-NEXT: s_min_u32 s6, s6, 1 +; GCN-IR-NEXT: s_or_b32 s6, s7, s6 +; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GCN-IR-NEXT: s_min_u32 s4, s4, 1 +; GCN-IR-NEXT: s_or_b32 s4, s5, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-IR-NEXT: s_sub_i32 s4, 32, s10 +; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s4 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-IR-NEXT: s_sub_i32 s4, 32, s9 +; GCN-IR-NEXT: v_ldexp_f32_e64 v1, v1, s4 +; GCN-IR-NEXT: s_ashr_i32 s8, s8, 30 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-IR-NEXT: s_cselect_b32 s0, s2, 0 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GCN-IR-NEXT: s_or_b32 s6, s8, 1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 @@ -529,54 +575,70 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) { ; GCN-LABEL: v_test_sdiv24_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v3 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, v0 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 +; GCN-NEXT: s_flbit_i32_b32 s4, 0 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_min_u32 s4, s4, 32 +; GCN-NEXT: v_lshl_b64 v[3:4], v[2:3], s4 +; GCN-NEXT: s_sub_i32 s4, 32, s4 +; GCN-NEXT: v_min_u32_e32 v0, 1, v3 +; GCN-NEXT: v_or_b32_e32 v0, v4, v0 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_mul_lo_u32 v3, v3, v2 -; GCN-NEXT: v_mul_hi_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_mul_hi_u32 v2, v1, v2 -; GCN-NEXT: v_mul_u32_u24_e32 v3, v2, v0 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s4 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: v_mul_lo_u32 v3, v3, v0 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v3 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v0, v1, v0 +; GCN-NEXT: v_mul_u32_u24_e32 v3, v0, v2 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v0 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v0 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v2 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v2 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IR-LABEL: v_test_sdiv24_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v3 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, v0 -; GCN-IR-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 +; GCN-IR-NEXT: s_flbit_i32_b32 s4, 0 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_min_u32 s4, s4, 32 +; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[2:3], s4 +; GCN-IR-NEXT: s_sub_i32 s4, 32, s4 +; GCN-IR-NEXT: v_min_u32_e32 v0, 1, v3 +; GCN-IR-NEXT: v_or_b32_e32 v0, v4, v0 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v2 -; GCN-IR-NEXT: v_mul_hi_u32 v3, v2, v3 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-IR-NEXT: v_mul_hi_u32 v2, v1, v2 -; GCN-IR-NEXT: v_mul_u32_u24_e32 v3, v2, v0 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s4 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GCN-IR-NEXT: 
v_cvt_u32_f32_e32 v0, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v3 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GCN-IR-NEXT: v_mul_hi_u32 v0, v1, v0 +; GCN-IR-NEXT: v_mul_u32_u24_e32 v3, v0, v2 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v0 ; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; GCN-IR-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 -; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v0 +; GCN-IR-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v2 ; GCN-IR-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 1, v2 -; GCN-IR-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; GCN-IR-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] %1 = lshr i64 %x, 40 @@ -772,64 +834,110 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv23_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s2, s[4:5], 0xe -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s0, s2, 9 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_ashr_i32 s1, s3, 9 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_xor_b32 s0, s1, s0 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-NEXT: s_or_b32 s2, s0, 1 +; GCN-NEXT: s_ashr_i32 s5, s11, 31 +; GCN-NEXT: s_ashr_i32 s4, s11, 9 +; GCN-NEXT: s_mov_b32 s1, s9 +; GCN-NEXT: s_xor_b32 s9, s4, s5 +; GCN-NEXT: s_ashr_i32 s9, s9, 31 +; GCN-NEXT: s_flbit_i32 s10, s5 +; GCN-NEXT: s_ashr_i32 s7, s6, 31 +; GCN-NEXT: s_ashr_i32 s6, s6, 9 +; GCN-NEXT: s_add_i32 s9, s9, 32 +; GCN-NEXT: s_add_i32 s10, s10, -1 +; GCN-NEXT: s_min_u32 s9, s10, s9 +; GCN-NEXT: s_xor_b32 s10, s6, s7 +; GCN-NEXT: s_ashr_i32 s10, s10, 31 +; GCN-NEXT: s_flbit_i32 s11, s7 +; GCN-NEXT: s_add_i32 s10, s10, 32 +; GCN-NEXT: s_add_i32 s11, s11, -1 +; GCN-NEXT: s_min_u32 s10, s11, s10 +; GCN-NEXT: s_mov_b32 s0, s8 +; GCN-NEXT: s_xor_b32 s8, s4, s6 +; GCN-NEXT: s_lshl_b64 s[6:7], s[6:7], s10 +; GCN-NEXT: s_min_u32 s6, s6, 1 +; GCN-NEXT: s_or_b32 s6, s7, s6 +; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GCN-NEXT: s_min_u32 s4, s4, 1 +; GCN-NEXT: s_or_b32 s4, s5, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: s_sub_i32 s4, 32, s10 +; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s4 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-NEXT: s_sub_i32 s4, 32, s9 +; GCN-NEXT: v_ldexp_f32_e64 v1, v1, s4 +; GCN-NEXT: s_ashr_i32 s8, s8, 30 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s0, s2, 0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GCN-NEXT: s_or_b32 s6, s8, 1 +; GCN-NEXT: v_cmp_ge_f32_e64 
s[4:5], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv23_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_load_dword s2, s[4:5], 0xe -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GCN-IR-NEXT: s_load_dword s6, s[4:5], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i32 s0, s2, 9 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: s_ashr_i32 s1, s3, 9 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_xor_b32 s0, s1, s0 -; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-IR-NEXT: s_or_b32 s2, s0, 1 +; GCN-IR-NEXT: s_ashr_i32 s5, s11, 31 +; GCN-IR-NEXT: s_ashr_i32 s4, s11, 9 +; GCN-IR-NEXT: s_mov_b32 s1, s9 +; GCN-IR-NEXT: s_xor_b32 s9, s4, s5 +; GCN-IR-NEXT: s_ashr_i32 s9, s9, 31 +; GCN-IR-NEXT: s_flbit_i32 s10, s5 +; GCN-IR-NEXT: s_ashr_i32 s7, s6, 31 +; GCN-IR-NEXT: s_ashr_i32 s6, s6, 9 +; GCN-IR-NEXT: s_add_i32 s9, s9, 32 +; GCN-IR-NEXT: s_add_i32 s10, s10, -1 +; GCN-IR-NEXT: s_min_u32 s9, s10, s9 +; GCN-IR-NEXT: s_xor_b32 s10, s6, s7 +; GCN-IR-NEXT: s_ashr_i32 s10, s10, 31 +; GCN-IR-NEXT: s_flbit_i32 s11, s7 +; GCN-IR-NEXT: s_add_i32 s10, s10, 32 +; GCN-IR-NEXT: s_add_i32 s11, s11, -1 +; GCN-IR-NEXT: s_min_u32 s10, s11, s10 +; GCN-IR-NEXT: s_mov_b32 s0, s8 +; GCN-IR-NEXT: s_xor_b32 s8, s4, s6 +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], s10 +; GCN-IR-NEXT: s_min_u32 s6, s6, 1 +; GCN-IR-NEXT: s_or_b32 s6, s7, s6 +; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GCN-IR-NEXT: s_min_u32 s4, s4, 1 +; GCN-IR-NEXT: s_or_b32 s4, s5, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-IR-NEXT: s_sub_i32 s4, 32, s10 +; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s4 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-IR-NEXT: s_sub_i32 s4, 32, s9 +; GCN-IR-NEXT: v_ldexp_f32_e64 v1, v1, s4 +; GCN-IR-NEXT: s_ashr_i32 s8, s8, 30 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-IR-NEXT: s_cselect_b32 s0, s2, 0 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GCN-IR-NEXT: s_or_b32 s6, s8, 1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 41 %2 = ashr i64 %y, 41 @@ -941,32 +1049,78 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s4, s13, 8 -; GCN-NEXT: v_cvt_f32_i32_e32 
v0, s4 -; GCN-NEXT: s_ashr_i32 s5, s9, 8 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s5 -; GCN-NEXT: s_xor_b32 s4, s5, s4 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-NEXT: s_ashr_i32 s6, s11, 8 -; GCN-NEXT: s_ashr_i32 s7, s15, 8 +; GCN-NEXT: s_ashr_i32 s7, s9, 31 +; GCN-NEXT: s_ashr_i32 s6, s9, 8 +; GCN-NEXT: s_ashr_i32 s5, s11, 31 +; GCN-NEXT: s_ashr_i32 s4, s11, 8 +; GCN-NEXT: s_ashr_i32 s11, s13, 31 +; GCN-NEXT: s_ashr_i32 s10, s13, 8 +; GCN-NEXT: s_xor_b32 s13, s6, s7 +; GCN-NEXT: s_ashr_i32 s13, s13, 31 +; GCN-NEXT: s_flbit_i32 s14, s7 +; GCN-NEXT: s_add_i32 s13, s13, 32 +; GCN-NEXT: s_add_i32 s14, s14, -1 +; GCN-NEXT: s_min_u32 s13, s14, s13 +; GCN-NEXT: s_xor_b32 s14, s10, s11 +; GCN-NEXT: s_ashr_i32 s9, s15, 31 +; GCN-NEXT: s_ashr_i32 s8, s15, 8 +; GCN-NEXT: s_ashr_i32 s14, s14, 31 +; GCN-NEXT: s_flbit_i32 s15, s11 +; GCN-NEXT: s_add_i32 s14, s14, 32 +; GCN-NEXT: s_add_i32 s15, s15, -1 +; GCN-NEXT: s_min_u32 s14, s15, s14 +; GCN-NEXT: s_xor_b32 s12, s6, s10 +; GCN-NEXT: s_lshl_b64 s[10:11], s[10:11], s14 +; GCN-NEXT: s_min_u32 s10, s10, 1 +; GCN-NEXT: s_or_b32 s10, s11, s10 +; GCN-NEXT: s_lshl_b64 s[6:7], s[6:7], s13 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s10 +; GCN-NEXT: s_min_u32 s6, s6, 1 +; GCN-NEXT: s_or_b32 s6, s7, s6 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-NEXT: s_sub_i32 s6, 32, s14 +; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s6 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-NEXT: s_sub_i32 s6, 32, s13 +; GCN-NEXT: v_ldexp_f32_e64 v1, v1, s6 +; GCN-NEXT: s_ashr_i32 s12, s12, 30 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: s_or_b32 s8, s4, 1 -; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN-NEXT: s_cselect_b32 s4, s8, 0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v2 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, s7 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, s6 -; GCN-NEXT: s_xor_b32 s4, s6, s7 -; GCN-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-NEXT: s_or_b32 s6, s4, 1 -; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NEXT: s_or_b32 s10, s12, 1 +; GCN-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_cselect_b32 s6, s10, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v2 +; GCN-NEXT: s_xor_b32 s6, s4, s8 +; GCN-NEXT: s_ashr_i32 s10, s6, 30 +; GCN-NEXT: s_xor_b32 s6, s4, s5 +; GCN-NEXT: s_ashr_i32 s6, s6, 31 +; GCN-NEXT: s_flbit_i32 s7, s5 +; GCN-NEXT: s_add_i32 s6, s6, 32 +; GCN-NEXT: s_add_i32 s7, s7, -1 +; GCN-NEXT: s_min_u32 s11, s7, s6 +; GCN-NEXT: s_xor_b32 s6, s8, s9 +; GCN-NEXT: s_ashr_i32 s6, s6, 31 +; GCN-NEXT: s_flbit_i32 s7, s9 +; GCN-NEXT: s_add_i32 s6, s6, 32 +; GCN-NEXT: s_add_i32 s7, s7, -1 +; GCN-NEXT: s_min_u32 s12, s7, s6 +; GCN-NEXT: s_lshl_b64 s[6:7], s[8:9], s12 +; GCN-NEXT: s_min_u32 s6, s6, 1 +; GCN-NEXT: s_or_b32 s6, s7, s6 +; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, s6 +; GCN-NEXT: s_min_u32 s4, s4, 1 +; GCN-NEXT: s_or_b32 s4, s5, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GCN-NEXT: s_sub_i32 s4, 32, s12 +; GCN-NEXT: v_ldexp_f32_e64 v2, v2, s4 +; GCN-NEXT: v_rcp_f32_e32 v4, v2 +; GCN-NEXT: s_sub_i32 s4, 32, s11 +; GCN-NEXT: v_ldexp_f32_e64 v3, v3, s4 +; GCN-NEXT: s_or_b32 s6, s10, 1 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 ; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 @@ -975,7 +1129,9 @@ define amdgpu_kernel void 
@s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN-NEXT: s_cselect_b32 s4, s6, 0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, s4, v4 +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_bfe_i32 v2, v2, 0, 24 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -987,32 +1143,78 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i32 s4, s13, 8 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GCN-IR-NEXT: s_ashr_i32 s5, s9, 8 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s5 -; GCN-IR-NEXT: s_xor_b32 s4, s5, s4 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-IR-NEXT: s_ashr_i32 s6, s11, 8 -; GCN-IR-NEXT: s_ashr_i32 s7, s15, 8 +; GCN-IR-NEXT: s_ashr_i32 s7, s9, 31 +; GCN-IR-NEXT: s_ashr_i32 s6, s9, 8 +; GCN-IR-NEXT: s_ashr_i32 s5, s11, 31 +; GCN-IR-NEXT: s_ashr_i32 s4, s11, 8 +; GCN-IR-NEXT: s_ashr_i32 s11, s13, 31 +; GCN-IR-NEXT: s_ashr_i32 s10, s13, 8 +; GCN-IR-NEXT: s_xor_b32 s13, s6, s7 +; GCN-IR-NEXT: s_ashr_i32 s13, s13, 31 +; GCN-IR-NEXT: s_flbit_i32 s14, s7 +; GCN-IR-NEXT: s_add_i32 s13, s13, 32 +; GCN-IR-NEXT: s_add_i32 s14, s14, -1 +; GCN-IR-NEXT: s_min_u32 s13, s14, s13 +; GCN-IR-NEXT: s_xor_b32 s14, s10, s11 +; GCN-IR-NEXT: s_ashr_i32 s9, s15, 31 +; GCN-IR-NEXT: s_ashr_i32 s8, s15, 8 +; GCN-IR-NEXT: s_ashr_i32 s14, s14, 31 +; GCN-IR-NEXT: s_flbit_i32 s15, s11 +; GCN-IR-NEXT: s_add_i32 s14, s14, 32 +; GCN-IR-NEXT: s_add_i32 s15, s15, -1 +; GCN-IR-NEXT: s_min_u32 s14, s15, s14 +; GCN-IR-NEXT: s_xor_b32 s12, s6, s10 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], s14 +; GCN-IR-NEXT: s_min_u32 s10, s10, 1 +; GCN-IR-NEXT: s_or_b32 s10, s11, s10 +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], s13 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s10 +; GCN-IR-NEXT: s_min_u32 s6, s6, 1 +; GCN-IR-NEXT: s_or_b32 s6, s7, s6 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GCN-IR-NEXT: s_sub_i32 s6, 32, s14 +; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s6 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-IR-NEXT: s_sub_i32 s6, 32, s13 +; GCN-IR-NEXT: v_ldexp_f32_e64 v1, v1, s6 +; GCN-IR-NEXT: s_ashr_i32 s12, s12, 30 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: s_or_b32 s8, s4, 1 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN-IR-NEXT: s_cselect_b32 s4, s8, 0 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v2 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, s7 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s6 -; GCN-IR-NEXT: s_xor_b32 s4, s6, s7 -; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-IR-NEXT: s_or_b32 s6, s4, 1 -; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: s_or_b32 s10, s12, 1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-IR-NEXT: s_cselect_b32 s6, s10, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s6, v2 +; GCN-IR-NEXT: s_xor_b32 s6, s4, s8 +; GCN-IR-NEXT: s_ashr_i32 s10, s6, 30 +; GCN-IR-NEXT: s_xor_b32 s6, s4, s5 +; GCN-IR-NEXT: s_ashr_i32 s6, s6, 31 +; GCN-IR-NEXT: s_flbit_i32 s7, s5 +; GCN-IR-NEXT: s_add_i32 s6, s6, 32 +; GCN-IR-NEXT: s_add_i32 s7, s7, -1 +; GCN-IR-NEXT: 
s_min_u32 s11, s7, s6 +; GCN-IR-NEXT: s_xor_b32 s6, s8, s9 +; GCN-IR-NEXT: s_ashr_i32 s6, s6, 31 +; GCN-IR-NEXT: s_flbit_i32 s7, s9 +; GCN-IR-NEXT: s_add_i32 s6, s6, 32 +; GCN-IR-NEXT: s_add_i32 s7, s7, -1 +; GCN-IR-NEXT: s_min_u32 s12, s7, s6 +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], s12 +; GCN-IR-NEXT: s_min_u32 s6, s6, 1 +; GCN-IR-NEXT: s_or_b32 s6, s7, s6 +; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, s6 +; GCN-IR-NEXT: s_min_u32 s4, s4, 1 +; GCN-IR-NEXT: s_or_b32 s4, s5, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GCN-IR-NEXT: s_sub_i32 s4, 32, s12 +; GCN-IR-NEXT: v_ldexp_f32_e64 v2, v2, s4 +; GCN-IR-NEXT: v_rcp_f32_e32 v4, v2 +; GCN-IR-NEXT: s_sub_i32 s4, 32, s11 +; GCN-IR-NEXT: v_ldexp_f32_e64 v3, v3, s4 +; GCN-IR-NEXT: s_or_b32 s6, s10, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 ; GCN-IR-NEXT: v_mad_f32 v3, -v4, v2, v3 @@ -1021,7 +1223,9 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN-IR-NEXT: s_cselect_b32 s4, s6, 0 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s4, v4 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_bfe_i32 v2, v2, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm @@ -1797,14 +2001,25 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s2, s3, 8 +; GCN-NEXT: s_ashr_i32 s9, s3, 31 +; GCN-NEXT: s_ashr_i32 s8, s3, 8 +; GCN-NEXT: s_xor_b32 s2, s8, s9 +; GCN-NEXT: s_flbit_i32 s3, s9 +; GCN-NEXT: s_ashr_i32 s2, s2, 31 +; GCN-NEXT: s_add_i32 s3, s3, -1 +; GCN-NEXT: s_add_i32 s2, s2, 32 +; GCN-NEXT: s_min_u32 s4, s3, s2 +; GCN-NEXT: s_lshl_b64 s[2:3], s[8:9], s4 +; GCN-NEXT: s_min_u32 s2, s2, 1 +; GCN-NEXT: s_or_b32 s2, s3, s2 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-NEXT: s_sub_i32 s3, 32, s4 ; GCN-NEXT: s_mov_b32 s2, 0x41c00000 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i32 s0, s3, 31 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s3 +; GCN-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_or_b32 s3, s0, 1 +; GCN-NEXT: s_or_b32 s3, s9, 1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mad_f32 v2, -v1, v0, s2 @@ -1824,14 +2039,25 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i32 s2, s3, 8 +; GCN-IR-NEXT: s_ashr_i32 s9, s3, 31 +; GCN-IR-NEXT: s_ashr_i32 s8, s3, 8 +; GCN-IR-NEXT: s_xor_b32 s2, s8, s9 +; GCN-IR-NEXT: s_flbit_i32 s3, s9 +; GCN-IR-NEXT: s_ashr_i32 s2, s2, 31 +; GCN-IR-NEXT: s_add_i32 s3, s3, -1 +; GCN-IR-NEXT: s_add_i32 s2, s2, 32 +; GCN-IR-NEXT: s_min_u32 s4, s3, s2 +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], s4 +; GCN-IR-NEXT: s_min_u32 s2, s2, 1 +; GCN-IR-NEXT: s_or_b32 s2, s3, s2 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-IR-NEXT: s_sub_i32 s3, 32, s4 ; GCN-IR-NEXT: s_mov_b32 s2, 0x41c00000 ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s3 +; GCN-IR-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: s_or_b32 s3, s0, 1 +; 
GCN-IR-NEXT: s_or_b32 s3, s9, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s2 @@ -1854,23 +2080,34 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(ptr addrspace(1) %out, i64 %x ; GCN-LABEL: s_test_sdiv24_k_den_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s2, 0x46b6fe00 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i32 s9, s3, 31 +; GCN-NEXT: s_ashr_i32 s8, s3, 8 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i32 s0, s3, 8 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-NEXT: s_ashr_i32 s0, s3, 31 +; GCN-NEXT: s_xor_b32 s0, s8, s9 +; GCN-NEXT: s_ashr_i32 s0, s0, 31 +; GCN-NEXT: s_flbit_i32 s2, s9 +; GCN-NEXT: s_add_i32 s0, s0, 32 +; GCN-NEXT: s_add_i32 s2, s2, -1 +; GCN-NEXT: s_min_u32 s0, s2, s0 +; GCN-NEXT: s_lshl_b64 s[2:3], s[8:9], s0 +; GCN-NEXT: s_min_u32 s2, s2, 1 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-NEXT: s_sub_i32 s0, 32, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_or_b32 s3, s0, 1 +; GCN-NEXT: s_or_b32 s2, s9, 1 +; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mad_f32 v0, -v1, s2, v0 +; GCN-NEXT: s_mov_b32 s0, 0x46b6fe00 +; GCN-NEXT: v_mad_f32 v0, -v1, s0, v0 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s2 +; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s0 ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s0, s3, 0 +; GCN-NEXT: s_cselect_b32 s0, s2, 0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1880,23 +2117,34 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(ptr addrspace(1) %out, i64 %x ; GCN-IR-LABEL: s_test_sdiv24_k_den_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s2, 0x46b6fe00 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i32 s9, s3, 31 +; GCN-IR-NEXT: s_ashr_i32 s8, s3, 8 ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i32 s0, s3, 8 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 +; GCN-IR-NEXT: s_xor_b32 s0, s8, s9 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 31 +; GCN-IR-NEXT: s_flbit_i32 s2, s9 +; GCN-IR-NEXT: s_add_i32 s0, s0, 32 +; GCN-IR-NEXT: s_add_i32 s2, s2, -1 +; GCN-IR-NEXT: s_min_u32 s0, s2, s0 +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], s0 +; GCN-IR-NEXT: s_min_u32 s2, s2, 1 +; GCN-IR-NEXT: s_or_b32 s2, s3, s2 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-IR-NEXT: s_sub_i32 s0, 32, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: s_or_b32 s3, s0, 1 +; GCN-IR-NEXT: s_or_b32 s2, s9, 1 +; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s0 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-IR-NEXT: v_mad_f32 v0, -v1, s2, v0 +; GCN-IR-NEXT: s_mov_b32 s0, 0x46b6fe00 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s0, v0 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s0 ; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-IR-NEXT: s_cselect_b32 s0, s3, 0 +; GCN-IR-NEXT: s_cselect_b32 s0, s2, 0 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v1 ; GCN-IR-NEXT: v_bfe_i32 v0, 
v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1912,19 +2160,30 @@ define i64 @v_test_sdiv24_k_num_i64(i64 %x) { ; GCN-LABEL: v_test_sdiv24_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ashrrev_i32_e32 v0, 8, v1 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 8, v1 +; GCN-NEXT: v_xor_b32_e32 v0, v1, v2 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GCN-NEXT: v_ffbh_i32_e32 v3, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v3 +; GCN-NEXT: v_min_u32_e32 v3, v3, v0 +; GCN-NEXT: v_lshl_b64 v[0:1], v[1:2], v3 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GCN-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v3, -v2, v0, s4 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_min_u32_e32 v0, 1, v0 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, 32, v3 +; GCN-NEXT: v_or_b32_e32 v2, 1, v2 +; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GCN-NEXT: v_rcp_f32_e32 v1, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mad_f32 v3, -v1, v0, s4 +; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1932,19 +2191,30 @@ define i64 @v_test_sdiv24_k_num_i64(i64 %x) { ; GCN-IR-LABEL: v_test_sdiv24_k_num_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 8, v1 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 8, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v1, v2 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GCN-IR-NEXT: v_ffbh_i32_e32 v3, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, -1, v3 +; GCN-IR-NEXT: v_min_u32_e32 v3, v3, v0 +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[1:2], v3 ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 -; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mad_f32 v3, -v2, v0, s4 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_min_u32_e32 v0, 1, v0 +; GCN-IR-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, 32, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, 1, v2 +; GCN-IR-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GCN-IR-NEXT: v_rcp_f32_e32 v1, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v3, -v1, v0, s4 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: s_setpc_b64 
s[30:31] @@ -1957,19 +2227,30 @@ define i64 @v_test_sdiv24_pow2_k_num_i64(i64 %x) { ; GCN-LABEL: v_test_sdiv24_pow2_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ashrrev_i32_e32 v0, 8, v1 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 8, v1 +; GCN-NEXT: v_xor_b32_e32 v0, v1, v2 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GCN-NEXT: v_ffbh_i32_e32 v3, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v3 +; GCN-NEXT: v_min_u32_e32 v3, v3, v0 +; GCN-NEXT: v_lshl_b64 v[0:1], v[1:2], v3 ; GCN-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GCN-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v3, -v2, v0, s4 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_min_u32_e32 v0, 1, v0 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, 32, v3 +; GCN-NEXT: v_or_b32_e32 v2, 1, v2 +; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GCN-NEXT: v_rcp_f32_e32 v1, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x47000000, v1 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mad_f32 v3, -v1, v0, s4 +; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1977,19 +2258,30 @@ define i64 @v_test_sdiv24_pow2_k_num_i64(i64 %x) { ; GCN-IR-LABEL: v_test_sdiv24_pow2_k_num_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 8, v1 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 8, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v1, v2 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GCN-IR-NEXT: v_ffbh_i32_e32 v3, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, -1, v3 +; GCN-IR-NEXT: v_min_u32_e32 v3, v3, v0 +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[1:2], v3 ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 -; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mad_f32 v3, -v2, v0, s4 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_min_u32_e32 v0, 1, v0 +; GCN-IR-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, 32, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, 1, v2 +; GCN-IR-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GCN-IR-NEXT: v_rcp_f32_e32 v1, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x47000000, v1 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v3, -v1, v0, s4 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -2013,11 +2305,22 
@@ define i64 @v_test_sdiv24_pow2_k_den_i64(i64 %x) { ; GCN-IR-LABEL: v_test_sdiv24_pow2_k_den_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 8, v1 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 8, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v1, v2 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GCN-IR-NEXT: v_ffbh_i32_e32 v3, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, -1, v3 +; GCN-IR-NEXT: v_min_u32_e32 v3, v3, v0 +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[1:2], v3 ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_min_u32_e32 v0, 1, v0 +; GCN-IR-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 32, v3 +; GCN-IR-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x38000000, v0 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v0, -v2, s4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll index 49dec15f9f7d7..173a2edcf7587 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll @@ -11,65 +11,108 @@ define amdgpu_kernel void @v_sext_in_reg_i8_i16_shuffle_vector(ptr addrspace(1) ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v0, s[2:3] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h -; GFX11-TRUE16-NEXT: v_bfe_i32 v7, v2, 0, 8 -; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v5, 24, v2 -; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v6, 24, v1 -; GFX11-TRUE16-NEXT: v_bfe_i32 v8, v3, 0, 8 -; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v4, 31, v2 +; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v3, 24, v2 +; GFX11-TRUE16-NEXT: v_bfe_i32 v8, v2, 0, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.h +; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v7, 24, v1 +; GFX11-TRUE16-NEXT: v_cls_i32_e32 v5, v4 +; GFX11-TRUE16-NEXT: v_xor_b32_e32 v0, v3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_i32 v6, v6, 0, 8 +; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v7, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, -1, v5 +; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 32, v0 +; GFX11-TRUE16-NEXT: v_min_u32_e32 v5, v5, v0 +; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.l, 8, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.h, 8, v1.l ; GFX11-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.h, 8, v2.l -; GFX11-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b64 v[3:4], v5, v[3:4] ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: v_bfe_i32 v9, v2, 0, 8 ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.h, v0.h -; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v2.h, v6.l -; 
GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v4.h, v5.l ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v1.l, v1.l +; GFX11-TRUE16-NEXT: v_min_u32_e32 v3, 1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v4, 32, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.l +; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v5, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v8.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v2.h, v3.l +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v3, v0.h, v1.l +; GFX11-TRUE16-NEXT: v_ldexp_f32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v1.h, v2.l -; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v2.l, v3.l +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v4.h, v5 ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v4.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v3, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v0.h, v1.h -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v2, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v4.h, v4.l +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v2, v2.l, v4.l ; GFX11-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: v_sext_in_reg_i8_i16_shuffle_vector: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, 0 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v3, 31, v1 ; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v2, 24, v1 -; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v5, 24, v0 -; GFX11-FAKE16-NEXT: v_ashrrev_i16 v6, 8, v1 -; GFX11-FAKE16-NEXT: v_bfe_i32 v7, v0, 0, 8 -; GFX11-FAKE16-NEXT: v_ashrrev_i16 v0, 8, v0 +; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v6, 24, v0 +; GFX11-FAKE16-NEXT: v_ashrrev_i16 v7, 8, v1 +; GFX11-FAKE16-NEXT: v_bfe_i32 v8, v1, 0, 8 +; GFX11-FAKE16-NEXT: v_cls_i32_e32 v5, v3 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v4, v2, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_ashrrev_i16 v9, 8, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, -1, v5 +; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v4, 31, v4 ; GFX11-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX11-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX11-FAKE16-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v7, v7 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 32, v4 +; GFX11-FAKE16-NEXT: v_min_u32_e32 v4, v5, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v4, 32, v4 ; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v0, v0 -; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v1, v1 -; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v6, v6 -; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v5, v5 -; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v8, v2 -; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v2, v4 -; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v4, v3 -; GFX11-FAKE16-NEXT: v_pack_b32_f16 v3, v0, v7 -; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v6, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_pack_b32_f16 v2, v5, v2 -; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v8, v4 -; GFX11-FAKE16-NEXT: global_store_b128 v9, v[0:3], s[0:1] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-FAKE16-NEXT: v_bfe_i32 v3, v5, 0, 8 +; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v5, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v9, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v3, v5, v0 +; GFX11-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v4, v8 +; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v8, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v7, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v2, v8 +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v2, v6, v9 +; GFX11-FAKE16-NEXT: global_store_b128 v10, v[0:3], s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <{ [0 x i8] }>, ptr addrspace(1) %ptr, i64 0, i32 0, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 47dfa9f4fc2d3..88daebb760794 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -441,74 +441,120 @@ define i64 @v_test_srem(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem23_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s2, s[4:5], 0xe -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_ashr_i32 s3, s3, 9 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s2, s2, 9 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_xor_b32 s0, s3, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_or_b32 s8, s0, 1 +; GCN-NEXT: s_ashr_i32 s4, s11, 9 +; GCN-NEXT: s_ashr_i32 s7, s6, 31 +; GCN-NEXT: s_ashr_i32 s6, s6, 9 +; GCN-NEXT: s_mov_b32 s0, s8 +; GCN-NEXT: s_ashr_i32 s5, s11, 31 +; GCN-NEXT: s_xor_b32 s8, s4, s6 +; GCN-NEXT: s_ashr_i32 s12, s8, 30 +; GCN-NEXT: s_xor_b32 s8, s4, s5 +; GCN-NEXT: 
s_mov_b32 s1, s9 +; GCN-NEXT: s_ashr_i32 s8, s8, 31 +; GCN-NEXT: s_flbit_i32 s9, s5 +; GCN-NEXT: s_add_i32 s8, s8, 32 +; GCN-NEXT: s_add_i32 s9, s9, -1 +; GCN-NEXT: s_min_u32 s13, s9, s8 +; GCN-NEXT: s_lshl_b64 s[8:9], s[4:5], s13 +; GCN-NEXT: s_min_u32 s5, s8, 1 +; GCN-NEXT: s_xor_b32 s8, s6, s7 +; GCN-NEXT: s_ashr_i32 s8, s8, 31 +; GCN-NEXT: s_flbit_i32 s10, s7 +; GCN-NEXT: s_add_i32 s8, s8, 32 +; GCN-NEXT: s_add_i32 s10, s10, -1 +; GCN-NEXT: s_min_u32 s8, s10, s8 +; GCN-NEXT: s_lshl_b64 s[10:11], s[6:7], s8 +; GCN-NEXT: s_min_u32 s7, s10, 1 +; GCN-NEXT: s_or_b32 s7, s11, s7 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s7 +; GCN-NEXT: s_or_b32 s5, s9, s5 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GCN-NEXT: s_sub_i32 s5, 32, s8 +; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s5 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-NEXT: s_sub_i32 s7, 32, s13 +; GCN-NEXT: v_ldexp_f32_e64 v1, v1, s7 +; GCN-NEXT: s_or_b32 s5, s12, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s0, s8, 0 -; GCN-NEXT: v_readfirstlane_b32 s1, v2 -; GCN-NEXT: s_add_i32 s0, s1, s0 -; GCN-NEXT: s_mul_i32 s0, s0, s2 -; GCN-NEXT: s_sub_i32 s0, s3, s0 -; GCN-NEXT: s_bfe_i32 s0, s0, 0x170000 -; GCN-NEXT: s_ashr_i32 s1, s0, 31 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GCN-NEXT: s_cselect_b32 s5, s5, 0 +; GCN-NEXT: v_readfirstlane_b32 s7, v2 +; GCN-NEXT: s_add_i32 s5, s7, s5 +; GCN-NEXT: s_mul_i32 s5, s5, s6 +; GCN-NEXT: s_sub_i32 s4, s4, s5 +; GCN-NEXT: s_bfe_i32 s4, s4, 0x170000 +; GCN-NEXT: s_ashr_i32 s5, s4, 31 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem23_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_load_dword s2, s[4:5], 0xe -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: s_ashr_i32 s3, s3, 9 +; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GCN-IR-NEXT: s_load_dword s6, s[4:5], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i32 s2, s2, 9 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_xor_b32 s0, s3, s2 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: s_or_b32 s8, s0, 1 +; GCN-IR-NEXT: s_ashr_i32 s4, s11, 9 +; GCN-IR-NEXT: s_ashr_i32 s7, s6, 31 +; GCN-IR-NEXT: s_ashr_i32 s6, s6, 9 +; GCN-IR-NEXT: s_mov_b32 s0, s8 +; GCN-IR-NEXT: s_ashr_i32 s5, s11, 31 +; GCN-IR-NEXT: s_xor_b32 s8, s4, s6 +; GCN-IR-NEXT: s_ashr_i32 s12, s8, 30 +; GCN-IR-NEXT: s_xor_b32 s8, s4, s5 +; GCN-IR-NEXT: s_mov_b32 s1, s9 +; GCN-IR-NEXT: s_ashr_i32 s8, s8, 31 +; GCN-IR-NEXT: s_flbit_i32 s9, s5 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 +; GCN-IR-NEXT: s_add_i32 s9, s9, -1 +; GCN-IR-NEXT: s_min_u32 s13, s9, s8 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[4:5], s13 +; GCN-IR-NEXT: s_min_u32 s5, s8, 1 +; GCN-IR-NEXT: s_xor_b32 s8, s6, s7 +; GCN-IR-NEXT: s_ashr_i32 s8, s8, 31 +; GCN-IR-NEXT: 
s_flbit_i32 s10, s7 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 +; GCN-IR-NEXT: s_add_i32 s10, s10, -1 +; GCN-IR-NEXT: s_min_u32 s8, s10, s8 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[6:7], s8 +; GCN-IR-NEXT: s_min_u32 s7, s10, 1 +; GCN-IR-NEXT: s_or_b32 s7, s11, s7 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s7 +; GCN-IR-NEXT: s_or_b32 s5, s9, s5 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GCN-IR-NEXT: s_sub_i32 s5, 32, s8 +; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s5 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-IR-NEXT: s_sub_i32 s7, 32, s13 +; GCN-IR-NEXT: v_ldexp_f32_e64 v1, v1, s7 +; GCN-IR-NEXT: s_or_b32 s5, s12, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-IR-NEXT: s_cselect_b32 s0, s8, 0 -; GCN-IR-NEXT: v_readfirstlane_b32 s1, v2 -; GCN-IR-NEXT: s_add_i32 s0, s1, s0 -; GCN-IR-NEXT: s_mul_i32 s0, s0, s2 -; GCN-IR-NEXT: s_sub_i32 s0, s3, s0 -; GCN-IR-NEXT: s_bfe_i32 s0, s0, 0x170000 -; GCN-IR-NEXT: s_ashr_i32 s1, s0, 31 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GCN-IR-NEXT: s_cselect_b32 s5, s5, 0 +; GCN-IR-NEXT: v_readfirstlane_b32 s7, v2 +; GCN-IR-NEXT: s_add_i32 s5, s7, s5 +; GCN-IR-NEXT: s_mul_i32 s5, s5, s6 +; GCN-IR-NEXT: s_sub_i32 s4, s4, s5 +; GCN-IR-NEXT: s_bfe_i32 s4, s4, 0x170000 +; GCN-IR-NEXT: s_ashr_i32 s5, s4, 31 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 41 %2 = ashr i64 %y, 41 @@ -520,74 +566,120 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s2, s[4:5], 0xe -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_ashr_i32 s3, s3, 8 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s2, s2, 8 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_xor_b32 s0, s3, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_or_b32 s8, s0, 1 +; GCN-NEXT: s_ashr_i32 s4, s11, 8 +; GCN-NEXT: s_ashr_i32 s7, s6, 31 +; GCN-NEXT: s_ashr_i32 s6, s6, 8 +; GCN-NEXT: s_mov_b32 s0, s8 +; GCN-NEXT: s_ashr_i32 s5, s11, 31 +; GCN-NEXT: s_xor_b32 s8, s4, s6 +; GCN-NEXT: s_ashr_i32 s12, s8, 30 +; GCN-NEXT: s_xor_b32 s8, s4, s5 +; GCN-NEXT: s_mov_b32 s1, s9 +; GCN-NEXT: s_ashr_i32 s8, s8, 31 +; GCN-NEXT: s_flbit_i32 s9, s5 +; GCN-NEXT: s_add_i32 s8, s8, 32 +; GCN-NEXT: s_add_i32 s9, s9, -1 +; GCN-NEXT: s_min_u32 s13, s9, s8 +; GCN-NEXT: s_lshl_b64 s[8:9], s[4:5], s13 +; GCN-NEXT: s_min_u32 s5, s8, 1 +; GCN-NEXT: s_xor_b32 s8, s6, s7 +; GCN-NEXT: s_ashr_i32 s8, s8, 31 +; GCN-NEXT: s_flbit_i32 s10, s7 +; GCN-NEXT: s_add_i32 s8, s8, 32 +; GCN-NEXT: s_add_i32 s10, s10, -1 +; GCN-NEXT: s_min_u32 
s8, s10, s8 +; GCN-NEXT: s_lshl_b64 s[10:11], s[6:7], s8 +; GCN-NEXT: s_min_u32 s7, s10, 1 +; GCN-NEXT: s_or_b32 s7, s11, s7 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s7 +; GCN-NEXT: s_or_b32 s5, s9, s5 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GCN-NEXT: s_sub_i32 s5, 32, s8 +; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s5 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-NEXT: s_sub_i32 s7, 32, s13 +; GCN-NEXT: v_ldexp_f32_e64 v1, v1, s7 +; GCN-NEXT: s_or_b32 s5, s12, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s0, s8, 0 -; GCN-NEXT: v_readfirstlane_b32 s1, v2 -; GCN-NEXT: s_add_i32 s0, s1, s0 -; GCN-NEXT: s_mul_i32 s0, s0, s2 -; GCN-NEXT: s_sub_i32 s0, s3, s0 -; GCN-NEXT: s_bfe_i32 s0, s0, 0x180000 -; GCN-NEXT: s_ashr_i32 s1, s0, 31 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GCN-NEXT: s_cselect_b32 s5, s5, 0 +; GCN-NEXT: v_readfirstlane_b32 s7, v2 +; GCN-NEXT: s_add_i32 s5, s7, s5 +; GCN-NEXT: s_mul_i32 s5, s5, s6 +; GCN-NEXT: s_sub_i32 s4, s4, s5 +; GCN-NEXT: s_bfe_i32 s4, s4, 0x180000 +; GCN-NEXT: s_ashr_i32 s5, s4, 31 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_load_dword s2, s[4:5], 0xe -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: s_ashr_i32 s3, s3, 8 +; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GCN-IR-NEXT: s_load_dword s6, s[4:5], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i32 s2, s2, 8 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_xor_b32 s0, s3, s2 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: s_or_b32 s8, s0, 1 +; GCN-IR-NEXT: s_ashr_i32 s4, s11, 8 +; GCN-IR-NEXT: s_ashr_i32 s7, s6, 31 +; GCN-IR-NEXT: s_ashr_i32 s6, s6, 8 +; GCN-IR-NEXT: s_mov_b32 s0, s8 +; GCN-IR-NEXT: s_ashr_i32 s5, s11, 31 +; GCN-IR-NEXT: s_xor_b32 s8, s4, s6 +; GCN-IR-NEXT: s_ashr_i32 s12, s8, 30 +; GCN-IR-NEXT: s_xor_b32 s8, s4, s5 +; GCN-IR-NEXT: s_mov_b32 s1, s9 +; GCN-IR-NEXT: s_ashr_i32 s8, s8, 31 +; GCN-IR-NEXT: s_flbit_i32 s9, s5 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 +; GCN-IR-NEXT: s_add_i32 s9, s9, -1 +; GCN-IR-NEXT: s_min_u32 s13, s9, s8 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[4:5], s13 +; GCN-IR-NEXT: s_min_u32 s5, s8, 1 +; GCN-IR-NEXT: s_xor_b32 s8, s6, s7 +; GCN-IR-NEXT: s_ashr_i32 s8, s8, 31 +; GCN-IR-NEXT: s_flbit_i32 s10, s7 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 +; GCN-IR-NEXT: s_add_i32 s10, s10, -1 +; GCN-IR-NEXT: s_min_u32 s8, s10, s8 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[6:7], s8 +; GCN-IR-NEXT: s_min_u32 s7, s10, 1 +; GCN-IR-NEXT: s_or_b32 s7, s11, s7 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s7 +; GCN-IR-NEXT: s_or_b32 s5, s9, s5 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GCN-IR-NEXT: s_sub_i32 s5, 32, s8 +; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s5 +; 
GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-IR-NEXT: s_sub_i32 s7, 32, s13 +; GCN-IR-NEXT: v_ldexp_f32_e64 v1, v1, s7 +; GCN-IR-NEXT: s_or_b32 s5, s12, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-IR-NEXT: s_cselect_b32 s0, s8, 0 -; GCN-IR-NEXT: v_readfirstlane_b32 s1, v2 -; GCN-IR-NEXT: s_add_i32 s0, s1, s0 -; GCN-IR-NEXT: s_mul_i32 s0, s0, s2 -; GCN-IR-NEXT: s_sub_i32 s0, s3, s0 -; GCN-IR-NEXT: s_bfe_i32 s0, s0, 0x180000 -; GCN-IR-NEXT: s_ashr_i32 s1, s0, 31 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GCN-IR-NEXT: s_cselect_b32 s5, s5, 0 +; GCN-IR-NEXT: v_readfirstlane_b32 s7, v2 +; GCN-IR-NEXT: s_add_i32 s5, s7, s5 +; GCN-IR-NEXT: s_mul_i32 s5, s5, s6 +; GCN-IR-NEXT: s_sub_i32 s4, s4, s5 +; GCN-IR-NEXT: s_bfe_i32 s4, s4, 0x180000 +; GCN-IR-NEXT: s_ashr_i32 s5, s4, 31 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 @@ -600,22 +692,46 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) { ; GCN-LABEL: v_test_srem24_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ashrrev_i32_e32 v0, 8, v3 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 8, v1 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, v1 -; GCN-NEXT: v_xor_b32_e32 v5, v1, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-NEXT: v_ashrrev_i32_e32 v5, 30, v5 -; GCN-NEXT: v_or_b32_e32 v5, 1, v5 -; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 -; GCN-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_mul_lo_u32 v0, v2, v0 +; GCN-NEXT: v_xor_b32_e32 v5, v1, v2 +; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GCN-NEXT: v_ffbh_i32_e32 v6, v2 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 32, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v6 +; GCN-NEXT: v_min_u32_e32 v7, v6, v5 +; GCN-NEXT: v_lshl_b64 v[5:6], v[1:2], v7 +; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-NEXT: v_ashrrev_i32_e32 v3, 8, v3 +; GCN-NEXT: v_min_u32_e32 v2, 1, v5 +; GCN-NEXT: v_xor_b32_e32 v5, v3, v4 +; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GCN-NEXT: v_ffbh_i32_e32 v8, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 32, v5 +; GCN-NEXT: v_add_i32_e32 v8, vcc, -1, v8 +; GCN-NEXT: v_min_u32_e32 v8, v8, v5 +; GCN-NEXT: v_lshl_b64 v[4:5], v[3:4], v8 +; GCN-NEXT: v_or_b32_e32 v2, v6, v2 +; GCN-NEXT: v_min_u32_e32 v4, 1, v4 +; GCN-NEXT: v_or_b32_e32 v4, v5, v4 +; GCN-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, 32, v8 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GCN-NEXT: v_ldexp_f32_e32 v4, v4, v5 +; GCN-NEXT: v_rcp_f32_e32 v5, v4 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, 32, v7 +; GCN-NEXT: v_ldexp_f32_e32 v2, v2, v6 +; GCN-NEXT: v_mul_f32_e32 v5, v2, v5 +; GCN-NEXT: v_trunc_f32_e32 v5, v5 +; GCN-NEXT: v_xor_b32_e32 v0, v1, v3 +; GCN-NEXT: v_mad_f32 v2, -v5, v4, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GCN-NEXT: 
v_or_b32_e32 v0, 1, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v4| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, v3 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -624,22 +740,46 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) { ; GCN-IR-LABEL: v_test_srem24_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 8, v3 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 8, v1 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, v1 -; GCN-IR-NEXT: v_xor_b32_e32 v5, v1, v0 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v5, 30, v5 -; GCN-IR-NEXT: v_or_b32_e32 v5, 1, v5 -; GCN-IR-NEXT: v_mul_f32_e32 v4, v3, v4 -; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-IR-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| -; GCN-IR-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v2, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v5, v1, v2 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GCN-IR-NEXT: v_ffbh_i32_e32 v6, v2 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, 32, v5 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v6 +; GCN-IR-NEXT: v_min_u32_e32 v7, v6, v5 +; GCN-IR-NEXT: v_lshl_b64 v[5:6], v[1:2], v7 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v3, 8, v3 +; GCN-IR-NEXT: v_min_u32_e32 v2, 1, v5 +; GCN-IR-NEXT: v_xor_b32_e32 v5, v3, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GCN-IR-NEXT: v_ffbh_i32_e32 v8, v4 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, 32, v5 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v8 +; GCN-IR-NEXT: v_min_u32_e32 v8, v8, v5 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[3:4], v8 +; GCN-IR-NEXT: v_or_b32_e32 v2, v6, v2 +; GCN-IR-NEXT: v_min_u32_e32 v4, 1, v4 +; GCN-IR-NEXT: v_or_b32_e32 v4, v5, v4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, 32, v8 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GCN-IR-NEXT: v_ldexp_f32_e32 v4, v4, v5 +; GCN-IR-NEXT: v_rcp_f32_e32 v5, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 32, v7 +; GCN-IR-NEXT: v_ldexp_f32_e32 v2, v2, v6 +; GCN-IR-NEXT: v_mul_f32_e32 v5, v2, v5 +; GCN-IR-NEXT: v_trunc_f32_e32 v5, v5 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v1, v3 +; GCN-IR-NEXT: v_mad_f32 v2, -v5, v4, v2 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GCN-IR-NEXT: v_or_b32_e32 v0, 1, v0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v4| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1917,66 +2057,88 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_srem24_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s8, 0x41c00000 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s2, s3, 8 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 -; 
GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i32 s0, s3, 31 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-NEXT: s_or_b32 s3, s0, 1 +; GCN-NEXT: s_ashr_i32 s9, s7, 31 +; GCN-NEXT: s_ashr_i32 s8, s7, 8 +; GCN-NEXT: s_xor_b32 s0, s8, s9 +; GCN-NEXT: s_flbit_i32 s1, s9 +; GCN-NEXT: s_ashr_i32 s0, s0, 31 +; GCN-NEXT: s_add_i32 s1, s1, -1 +; GCN-NEXT: s_add_i32 s0, s0, 32 +; GCN-NEXT: s_min_u32 s6, s1, s0 +; GCN-NEXT: s_lshl_b64 s[0:1], s[8:9], s6 +; GCN-NEXT: s_min_u32 s0, s0, 1 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GCN-NEXT: s_sub_i32 s0, 32, s6 +; GCN-NEXT: s_mov_b32 s7, 0x41c00000 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s0 +; GCN-NEXT: v_rcp_f32_e32 v1, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_or_b32 s6, s9, 1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mad_f32 v2, -v1, v0, s8 +; GCN-NEXT: v_mad_f32 v2, -v1, v0, s7 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s0, s3, 0 -; GCN-NEXT: v_readfirstlane_b32 s1, v1 -; GCN-NEXT: s_add_i32 s0, s1, s0 -; GCN-NEXT: s_mul_i32 s0, s0, s2 -; GCN-NEXT: s_sub_i32 s0, 24, s0 -; GCN-NEXT: s_bfe_i32 s0, s0, 0x180000 -; GCN-NEXT: s_ashr_i32 s1, s0, 31 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-NEXT: s_add_i32 s4, s5, s4 +; GCN-NEXT: s_mul_i32 s4, s4, s8 +; GCN-NEXT: s_sub_i32 s4, 24, s4 +; GCN-NEXT: s_bfe_i32 s4, s4, 0x180000 +; GCN-NEXT: s_ashr_i32 s5, s4, 31 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem24_k_num_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_mov_b32 s8, 0x41c00000 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i32 s2, s3, 8 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 -; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-IR-NEXT: s_or_b32 s3, s0, 1 +; GCN-IR-NEXT: s_ashr_i32 s9, s7, 31 +; GCN-IR-NEXT: s_ashr_i32 s8, s7, 8 +; GCN-IR-NEXT: s_xor_b32 s0, s8, s9 +; GCN-IR-NEXT: s_flbit_i32 s1, s9 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 31 +; GCN-IR-NEXT: s_add_i32 s1, s1, -1 +; GCN-IR-NEXT: s_add_i32 s0, s0, 32 +; GCN-IR-NEXT: s_min_u32 s6, s1, s0 +; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[8:9], s6 +; GCN-IR-NEXT: s_min_u32 s0, s0, 1 +; GCN-IR-NEXT: s_or_b32 s0, s1, s0 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GCN-IR-NEXT: s_sub_i32 s0, 32, s6 +; GCN-IR-NEXT: s_mov_b32 s7, 0x41c00000 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s0 +; GCN-IR-NEXT: v_rcp_f32_e32 v1, v0 +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_or_b32 s6, s9, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s8 +; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s7 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 -; 
GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-IR-NEXT: s_cselect_b32 s0, s3, 0 -; GCN-IR-NEXT: v_readfirstlane_b32 s1, v1 -; GCN-IR-NEXT: s_add_i32 s0, s1, s0 -; GCN-IR-NEXT: s_mul_i32 s0, s0, s2 -; GCN-IR-NEXT: s_sub_i32 s0, 24, s0 -; GCN-IR-NEXT: s_bfe_i32 s0, s0, 0x180000 -; GCN-IR-NEXT: s_ashr_i32 s1, s0, 31 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-IR-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-IR-NEXT: s_add_i32 s4, s5, s4 +; GCN-IR-NEXT: s_mul_i32 s4, s4, s8 +; GCN-IR-NEXT: s_sub_i32 s4, 24, s4 +; GCN-IR-NEXT: s_bfe_i32 s4, s4, 0x180000 +; GCN-IR-NEXT: s_ashr_i32 s5, s4, 31 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %x.shr = ashr i64 %x, 40 %result = srem i64 24, %x.shr @@ -1988,23 +2150,34 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(ptr addrspace(1) %out, i64 %x ; GCN-LABEL: s_test_srem24_k_den_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s2, 0x46b6fe00 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i32 s9, s3, 31 ; GCN-NEXT: s_ashr_i32 s8, s3, 8 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_ashr_i32 s0, s3, 31 +; GCN-NEXT: s_xor_b32 s0, s8, s9 +; GCN-NEXT: s_ashr_i32 s0, s0, 31 +; GCN-NEXT: s_flbit_i32 s2, s9 +; GCN-NEXT: s_add_i32 s0, s0, 32 +; GCN-NEXT: s_add_i32 s2, s2, -1 +; GCN-NEXT: s_min_u32 s0, s2, s0 +; GCN-NEXT: s_lshl_b64 s[2:3], s[8:9], s0 +; GCN-NEXT: s_min_u32 s2, s2, 1 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-NEXT: s_sub_i32 s0, 32, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_or_b32 s2, s9, 1 +; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mad_f32 v0, -v1, s2, v0 +; GCN-NEXT: s_mov_b32 s0, 0x46b6fe00 +; GCN-NEXT: v_mad_f32 v0, -v1, s0, v0 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: s_or_b32 s3, s0, 1 -; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s2 +; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s0 ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s0, s3, 0 +; GCN-NEXT: s_cselect_b32 s0, s2, 0 ; GCN-NEXT: v_readfirstlane_b32 s1, v1 ; GCN-NEXT: s_add_i32 s0, s1, s0 ; GCN-NEXT: s_mulk_i32 s0, 0x5b7f @@ -2019,23 +2192,34 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(ptr addrspace(1) %out, i64 %x ; GCN-IR-LABEL: s_test_srem24_k_den_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s2, 0x46b6fe00 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i32 s9, s3, 31 ; GCN-IR-NEXT: s_ashr_i32 s8, s3, 8 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 +; GCN-IR-NEXT: s_xor_b32 s0, s8, s9 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 31 +; GCN-IR-NEXT: s_flbit_i32 s2, s9 +; GCN-IR-NEXT: s_add_i32 s0, s0, 32 +; GCN-IR-NEXT: s_add_i32 s2, s2, -1 +; GCN-IR-NEXT: s_min_u32 s0, s2, s0 +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], s0 
+; GCN-IR-NEXT: s_min_u32 s2, s2, 1 +; GCN-IR-NEXT: s_or_b32 s2, s3, s2 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-IR-NEXT: s_sub_i32 s0, 32, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_or_b32 s2, s9, 1 +; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s0 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-IR-NEXT: v_mad_f32 v0, -v1, s2, v0 +; GCN-IR-NEXT: s_mov_b32 s0, 0x46b6fe00 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s0, v0 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-IR-NEXT: s_or_b32 s3, s0, 1 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s0 ; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-IR-NEXT: s_cselect_b32 s0, s3, 0 +; GCN-IR-NEXT: s_cselect_b32 s0, s2, 0 ; GCN-IR-NEXT: v_readfirstlane_b32 s1, v1 ; GCN-IR-NEXT: s_add_i32 s0, s1, s0 ; GCN-IR-NEXT: s_mulk_i32 s0, 0x5b7f @@ -2056,20 +2240,31 @@ define i64 @v_test_srem24_k_num_i64(i64 %x) { ; GCN-LABEL: v_test_srem24_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ashrrev_i32_e32 v0, 8, v1 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 8, v1 +; GCN-NEXT: v_xor_b32_e32 v0, v1, v2 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GCN-NEXT: v_ffbh_i32_e32 v3, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v3 +; GCN-NEXT: v_min_u32_e32 v0, v3, v0 +; GCN-NEXT: v_lshl_b64 v[3:4], v[1:2], v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, 32, v0 +; GCN-NEXT: v_min_u32_e32 v3, 1, v3 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GCN-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, 1, v2 +; GCN-NEXT: v_ldexp_f32_e32 v0, v3, v0 +; GCN-NEXT: v_rcp_f32_e32 v3, v0 ; GCN-NEXT: v_mul_f32_e32 v3, 0x41c00000, v3 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-NEXT: v_mad_f32 v4, -v3, v2, s4 +; GCN-NEXT: v_mad_f32 v4, -v3, v0, s4 ; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v0| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, v1 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -2078,20 +2273,31 @@ define i64 @v_test_srem24_k_num_i64(i64 %x) { ; GCN-IR-LABEL: v_test_srem24_k_num_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 8, v1 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 8, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v1, v2 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GCN-IR-NEXT: v_ffbh_i32_e32 v3, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, -1, v3 +; GCN-IR-NEXT: v_min_u32_e32 v0, v3, v0 +; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[1:2], v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 32, v0 +; GCN-IR-NEXT: v_min_u32_e32 v3, 1, v3 +; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-IR-NEXT: 
v_rcp_iflag_f32_e32 v3, v2 +; GCN-IR-NEXT: v_or_b32_e32 v2, 1, v2 +; GCN-IR-NEXT: v_ldexp_f32_e32 v0, v3, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v3, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v3, 0x41c00000, v3 ; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-IR-NEXT: v_mad_f32 v4, -v3, v2, s4 +; GCN-IR-NEXT: v_mad_f32 v4, -v3, v0, s4 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| -; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v1, v0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v0| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -2105,20 +2311,31 @@ define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) { ; GCN-LABEL: v_test_srem24_pow2_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ashrrev_i32_e32 v0, 8, v1 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 8, v1 +; GCN-NEXT: v_xor_b32_e32 v0, v1, v2 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GCN-NEXT: v_ffbh_i32_e32 v3, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v3 +; GCN-NEXT: v_min_u32_e32 v0, v3, v0 +; GCN-NEXT: v_lshl_b64 v[3:4], v[1:2], v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, 32, v0 +; GCN-NEXT: v_min_u32_e32 v3, 1, v3 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GCN-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GCN-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, 1, v2 +; GCN-NEXT: v_ldexp_f32_e32 v0, v3, v0 +; GCN-NEXT: v_rcp_f32_e32 v3, v0 ; GCN-NEXT: v_mul_f32_e32 v3, 0x47000000, v3 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-NEXT: v_mad_f32 v4, -v3, v2, s4 +; GCN-NEXT: v_mad_f32 v4, -v3, v0, s4 ; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v0| +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, v1 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -2127,20 +2344,31 @@ define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) { ; GCN-IR-LABEL: v_test_srem24_pow2_k_num_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 8, v1 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 8, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v1, v2 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GCN-IR-NEXT: v_ffbh_i32_e32 v3, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, -1, v3 +; GCN-IR-NEXT: v_min_u32_e32 v0, v3, v0 +; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[1:2], v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 32, v0 +; GCN-IR-NEXT: v_min_u32_e32 v3, 1, v3 +; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-IR-NEXT: 
v_rcp_iflag_f32_e32 v3, v2 +; GCN-IR-NEXT: v_or_b32_e32 v2, 1, v2 +; GCN-IR-NEXT: v_ldexp_f32_e32 v0, v3, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v3, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v3, 0x47000000, v3 ; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-IR-NEXT: v_mad_f32 v4, -v3, v2, s4 +; GCN-IR-NEXT: v_mad_f32 v4, -v3, v0, s4 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| -; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v1, v0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v0| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -2167,20 +2395,31 @@ define i64 @v_test_srem24_pow2_k_den_i64(i64 %x) { ; GCN-IR-LABEL: v_test_srem24_pow2_k_den_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 8, v1 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 8, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v1, v2 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GCN-IR-NEXT: v_ffbh_i32_e32 v3, v2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, -1, v3 +; GCN-IR-NEXT: v_min_u32_e32 v0, v3, v0 +; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[1:2], v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 32, v0 +; GCN-IR-NEXT: v_min_u32_e32 v3, 1, v3 +; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-IR-NEXT: v_mul_f32_e32 v3, 0x38000000, v2 +; GCN-IR-NEXT: v_or_b32_e32 v2, 1, v2 +; GCN-IR-NEXT: v_ldexp_f32_e32 v0, v3, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v3, 0x38000000, v0 ; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-IR-NEXT: v_mad_f32 v2, -v3, s4, v2 +; GCN-IR-NEXT: v_mad_f32 v0, -v3, s4, v0 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, s4 -; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GCN-IR-NEXT: v_lshlrev_b32_e32 v1, 15, v1 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GCN-IR-NEXT: v_lshlrev_b32_e32 v0, 15, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index e9017939f8a4a..cf6ea9a9f1d43 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -397,51 +397,79 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_udiv24_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[4:5], 0xe ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s6, 8 +; GCN-NEXT: s_load_dword s2, s[4:5], 0xe +; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_flbit_i32_b32 s8, 0 +; GCN-NEXT: s_min_u32 s8, s8, 32 +; GCN-NEXT: s_lshr_b32 s4, s3, 8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) 
+; GCN-NEXT: s_lshr_b32 s2, s2, 8 +; GCN-NEXT: s_mov_b32 s3, s5 +; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GCN-NEXT: s_min_u32 s2, s2, 1 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: s_lshr_b32 s2, s3, 8 +; GCN-NEXT: s_min_u32 s4, s4, 1 +; GCN-NEXT: s_or_b32 s2, s5, s4 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GCN-NEXT: s_sub_i32 s2, 32, s8 +; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s2 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-NEXT: v_ldexp_f32_e64 v1, v1, s2 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s6, s[4:5], 0xe ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s2, s6, 8 +; GCN-IR-NEXT: s_load_dword s2, s[4:5], 0xe +; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s8, 0 +; GCN-IR-NEXT: s_min_u32 s8, s8, 32 +; GCN-IR-NEXT: s_lshr_b32 s4, s3, 8 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_lshr_b32 s2, s2, 8 +; GCN-IR-NEXT: s_mov_b32 s3, s5 +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GCN-IR-NEXT: s_min_u32 s2, s2, 1 +; GCN-IR-NEXT: s_or_b32 s2, s3, s2 +; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8 +; GCN-IR-NEXT: s_min_u32 s4, s4, 1 +; GCN-IR-NEXT: s_or_b32 s2, s5, s4 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GCN-IR-NEXT: s_sub_i32 s2, 32, s8 +; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s2 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-IR-NEXT: v_ldexp_f32_e64 v1, v1, s2 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 40 @@ -455,16 +483,29 @@ define i64 @v_test_udiv24_i64(i64 %x, i64 %y) { ; GCN-LABEL: v_test_udiv24_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_flbit_i32_b32 s4, 0 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: s_min_u32 s4, s4, 32 +; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], s4 +; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], s4 +; GCN-NEXT: v_min_u32_e32 v2, 1, v2 +; GCN-NEXT: v_or_b32_e32 
v2, v3, v2 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GCN-NEXT: v_min_u32_e32 v0, 1, v0 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: s_sub_i32 s4, 32, s4 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 +; GCN-NEXT: v_ldexp_f32_e64 v1, v2, s4 +; GCN-NEXT: v_rcp_f32_e32 v2, v1 +; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s4 +; GCN-NEXT: v_mul_f32_e32 v2, v0, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 -; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GCN-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 @@ -473,16 +514,29 @@ define i64 @v_test_udiv24_i64(i64 %x, i64 %y) { ; GCN-IR-LABEL: v_test_udiv24_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s4, 0 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 +; GCN-IR-NEXT: s_min_u32 s4, s4, 32 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], s4 +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], s4 +; GCN-IR-NEXT: v_min_u32_e32 v2, 1, v2 +; GCN-IR-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GCN-IR-NEXT: v_min_u32_e32 v0, 1, v0 +; GCN-IR-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-IR-NEXT: s_sub_i32 s4, 32, s4 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 +; GCN-IR-NEXT: v_ldexp_f32_e64 v1, v2, s4 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v1 +; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s4 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v0, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 -; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GCN-IR-NEXT: v_mad_f32 v0, -v2, v1, v0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 @@ -497,12 +551,20 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-LABEL: s_test_udiv32_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s8, s[4:5], 0xe +; GCN-NEXT: s_flbit_i32_b32 s0, 0 +; GCN-NEXT: s_mov_b32 s9, 0 +; GCN-NEXT: s_min_u32 s2, s0, 32 ; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GCN-NEXT: s_lshl_b64 s[0:1], s[8:9], s2 +; GCN-NEXT: s_min_u32 s0, s0, 1 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: s_sub_i32 s0, 32, s2 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_sub_i32 s0, 0, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -533,12 +595,20 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-IR-LABEL: s_test_udiv32_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dword s8, s[4:5], 0xe +; GCN-IR-NEXT: s_flbit_i32_b32 s0, 0 +; 
GCN-IR-NEXT: s_mov_b32 s9, 0 +; GCN-IR-NEXT: s_min_u32 s2, s0, 32 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[8:9], s2 +; GCN-IR-NEXT: s_min_u32 s0, s0, 1 +; GCN-IR-NEXT: s_or_b32 s0, s1, s0 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-IR-NEXT: s_sub_i32 s0, 32, s2 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -576,13 +646,21 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-LABEL: s_test_udiv31_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[4:5], 0xe +; GCN-NEXT: s_flbit_i32_b32 s1, 0 +; GCN-NEXT: s_mov_b32 s9, 0 +; GCN-NEXT: s_min_u32 s2, s1, 32 ; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s8, s0, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GCN-NEXT: s_lshl_b64 s[0:1], s[8:9], s2 +; GCN-NEXT: s_min_u32 s0, s0, 1 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: s_sub_i32 s0, 32, s2 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_sub_i32 s0, 0, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -614,13 +692,21 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-IR-LABEL: s_test_udiv31_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dword s0, s[4:5], 0xe +; GCN-IR-NEXT: s_flbit_i32_b32 s1, 0 +; GCN-IR-NEXT: s_mov_b32 s9, 0 +; GCN-IR-NEXT: s_min_u32 s2, s1, 32 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_lshr_b32 s8, s0, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[8:9], s2 +; GCN-IR-NEXT: s_min_u32 s0, s0, 1 +; GCN-IR-NEXT: s_or_b32 s0, s1, s0 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-IR-NEXT: s_sub_i32 s0, 32, s2 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -658,51 +744,79 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv23_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[4:5], 0xe ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s6, 9 +; GCN-NEXT: s_load_dword s2, s[4:5], 0xe +; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_flbit_i32_b32 s8, 0 +; GCN-NEXT: s_min_u32 s8, s8, 32 +; GCN-NEXT: s_lshr_b32 s4, s3, 9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s2, s2, 9 +; GCN-NEXT: s_mov_b32 s3, s5 +; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GCN-NEXT: s_min_u32 s2, s2, 1 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; 
GCN-NEXT: s_lshr_b32 s2, s3, 9 +; GCN-NEXT: s_min_u32 s4, s4, 1 +; GCN-NEXT: s_or_b32 s2, s5, s4 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GCN-NEXT: s_sub_i32 s2, 32, s8 +; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s2 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-NEXT: v_ldexp_f32_e64 v1, v1, s2 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv23_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s6, s[4:5], 0xe ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s2, s6, 9 +; GCN-IR-NEXT: s_load_dword s2, s[4:5], 0xe +; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s8, 0 +; GCN-IR-NEXT: s_min_u32 s8, s8, 32 +; GCN-IR-NEXT: s_lshr_b32 s4, s3, 9 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_lshr_b32 s2, s2, 9 +; GCN-IR-NEXT: s_mov_b32 s3, s5 +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GCN-IR-NEXT: s_min_u32 s2, s2, 1 +; GCN-IR-NEXT: s_or_b32 s2, s3, s2 +; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-IR-NEXT: s_lshr_b32 s2, s3, 9 +; GCN-IR-NEXT: s_min_u32 s4, s4, 1 +; GCN-IR-NEXT: s_or_b32 s2, s5, s4 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GCN-IR-NEXT: s_sub_i32 s2, 32, s8 +; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s2 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-IR-NEXT: v_ldexp_f32_e64 v1, v1, s2 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 41 @@ -715,25 +829,37 @@ define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_udiv24_i48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s2, s2, 0xff000000 -; GCN-NEXT: s_and_b32 s4, s4, 0xff000000 +; GCN-NEXT: s_and_b32 s7, s11, 0xffff +; GCN-NEXT: s_and_b32 s6, s10, 0xff000000 +; GCN-NEXT: s_lshr_b64 s[6:7], s[6:7], 24 +; GCN-NEXT: s_flbit_i32_b32 s1, s7 +; GCN-NEXT: 
s_mov_b32 s0, s8 ; GCN-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_alignbit_b32 v0, s5, v0, 24 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: s_and_b32 s3, s3, 0xffff -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: v_alignbit_b32 v1, s3, v1, 24 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_and_b32 s4, s4, 0xff000000 +; GCN-NEXT: s_min_u32 s8, s1, 32 +; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], 24 +; GCN-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 +; GCN-NEXT: s_min_u32 s1, s6, 1 +; GCN-NEXT: s_flbit_i32_b32 s6, s5 +; GCN-NEXT: s_min_u32 s6, s6, 32 +; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 +; GCN-NEXT: s_min_u32 s4, s4, 1 +; GCN-NEXT: s_or_b32 s4, s5, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-NEXT: s_or_b32 s1, s7, s1 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GCN-NEXT: s_sub_i32 s1, 32, s6 +; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s1 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-NEXT: s_sub_i32 s4, 32, s8 +; GCN-NEXT: v_ldexp_f32_e64 v1, v1, s4 +; GCN-NEXT: s_mov_b32 s1, s9 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 @@ -741,31 +867,43 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GCN-NEXT: buffer_store_short v3, off, s[4:7], 0 offset:4 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_short v3, off, s[0:3], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv24_i48: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_and_b32 s2, s2, 0xff000000 -; GCN-IR-NEXT: s_and_b32 s4, s4, 0xff000000 +; GCN-IR-NEXT: s_and_b32 s7, s11, 0xffff +; GCN-IR-NEXT: s_and_b32 s6, s10, 0xff000000 +; GCN-IR-NEXT: s_lshr_b64 s[6:7], s[6:7], 24 +; GCN-IR-NEXT: s_flbit_i32_b32 s1, s7 +; GCN-IR-NEXT: s_mov_b32 s0, s8 ; GCN-IR-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 -; GCN-IR-NEXT: v_alignbit_b32 v0, s5, v0, 24 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff -; GCN-IR-NEXT: v_mov_b32_e32 v1, s2 -; GCN-IR-NEXT: v_alignbit_b32 v1, s3, v1, 24 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_and_b32 s4, s4, 0xff000000 +; GCN-IR-NEXT: s_min_u32 s8, s1, 32 +; GCN-IR-NEXT: s_lshr_b64 s[4:5], s[4:5], 24 +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 +; GCN-IR-NEXT: s_min_u32 s1, s6, 1 +; GCN-IR-NEXT: s_flbit_i32_b32 s6, s5 +; GCN-IR-NEXT: s_min_u32 s6, s6, 32 +; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 +; GCN-IR-NEXT: s_min_u32 s4, s4, 1 +; GCN-IR-NEXT: s_or_b32 s4, s5, s4 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-IR-NEXT: s_or_b32 s1, s7, s1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GCN-IR-NEXT: s_sub_i32 s1, 32, s6 +; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s1 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-IR-NEXT: s_sub_i32 s4, 32, s8 +; GCN-IR-NEXT: 
v_ldexp_f32_e64 v1, v1, s4
+; GCN-IR-NEXT: s_mov_b32 s1, s9
; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2
; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2
; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1
@@ -773,8 +911,8 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GCN-IR-NEXT: buffer_store_short v3, off, s[4:7], 0 offset:4
-; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-IR-NEXT: buffer_store_short v3, off, s[0:3], 0 offset:4
+; GCN-IR-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-IR-NEXT: s_endpgm
%1 = lshr i48 %x, 24
%2 = lshr i48 %y, 24
@@ -1430,13 +1568,21 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x
; GCN-LABEL: s_test_udiv24_k_num_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s4, 0x41c00000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s2, s3, 8
+; GCN-NEXT: s_flbit_i32_b32 s2, 0
+; GCN-NEXT: s_mov_b32 s5, 0
+; GCN-NEXT: s_min_u32 s6, s2, 32
+; GCN-NEXT: s_lshr_b32 s4, s3, 8
+; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], s6
+; GCN-NEXT: s_min_u32 s2, s2, 1
+; GCN-NEXT: s_or_b32 s2, s3, s2
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GCN-NEXT: s_sub_i32 s4, 32, s6
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0
+; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s4
+; GCN-NEXT: v_rcp_f32_e32 v1, v0
+; GCN-NEXT: s_mov_b32 s4, 0x41c00000
; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1
@@ -1451,13 +1597,21 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x
; GCN-IR-LABEL: s_test_udiv24_k_num_i64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8
+; GCN-IR-NEXT: s_flbit_i32_b32 s2, 0
+; GCN-IR-NEXT: s_mov_b32 s5, 0
+; GCN-IR-NEXT: s_min_u32 s6, s2, 32
+; GCN-IR-NEXT: s_lshr_b32 s4, s3, 8
+; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[4:5], s6
+; GCN-IR-NEXT: s_min_u32 s2, s2, 1
+; GCN-IR-NEXT: s_or_b32 s2, s3, s2
; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GCN-IR-NEXT: s_sub_i32 s4, 32, s6
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
-; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0
+; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s4
+; GCN-IR-NEXT: v_rcp_f32_e32 v1, v0
+; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000
; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1
; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1
; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1
@@ -1478,44 +1632,60 @@ define amdgpu_kernel void @s_test_udiv24_k_den_i64(ptr addrspace(1) %out, i64 %x
; GCN-LABEL: s_test_udiv24_k_den_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s2, s3, 8
+; GCN-NEXT: s_flbit_i32_b32 s2, 0
+; GCN-NEXT: s_mov_b32 s5, 0
+; GCN-NEXT: s_min_u32 s8, s2, 32
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_lshr_b32 s4, s3, 8
+; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], s8
+; GCN-NEXT: s_min_u32 s2, s2, 1
+; GCN-NEXT: s_or_b32 s2, s3, s2
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GCN-NEXT: s_mov_b32 s2, 0x46b6fe00
; GCN-NEXT: s_mov_b32 s4, s0
-; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_sub_i32 s0, 32, s8
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s0
; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1
-; GCN-NEXT: v_mad_f32 v0, -v1, s2, v0
-; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2
-; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_mov_b32 s0, 0x46b6fe00
+; GCN-NEXT: v_mad_f32 v0, -v1, s0, v0
+; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s0
; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; GCN-IR-LABEL: s_test_udiv24_k_den_i64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT: s_mov_b32 s6, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8
+; GCN-IR-NEXT: s_flbit_i32_b32 s2, 0
+; GCN-IR-NEXT: s_mov_b32 s5, 0
+; GCN-IR-NEXT: s_min_u32 s8, s2, 32
+; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT: s_lshr_b32 s4, s3, 8
+; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[4:5], s8
+; GCN-IR-NEXT: s_min_u32 s2, s2, 1
+; GCN-IR-NEXT: s_or_b32 s2, s3, s2
; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GCN-IR-NEXT: s_mov_b32 s2, 0x46b6fe00
; GCN-IR-NEXT: s_mov_b32 s4, s0
-; GCN-IR-NEXT: s_mov_b32 s5, s1
+; GCN-IR-NEXT: s_sub_i32 s0, 32, s8
+; GCN-IR-NEXT: s_mov_b32 s6, -1
+; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s0
; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0
; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1
; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1
-; GCN-IR-NEXT: v_mad_f32 v0, -v1, s2, v0
-; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2
-; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
+; GCN-IR-NEXT: s_mov_b32 s0, 0x46b6fe00
+; GCN-IR-NEXT: v_mad_f32 v0, -v1, s0, v0
+; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s0
; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GCN-IR-NEXT: s_mov_b32 s5, s1
; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0
+; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-IR-NEXT: s_endpgm
%x.shr = lshr i64 %x, 40
@@ -1528,10 +1698,18 @@ define i64 @v_test_udiv24_k_num_i64(i64 %x) {
; GCN-LABEL: v_test_udiv24_k_num_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_flbit_i32_b32 s4, 0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_min_u32 s4, s4, 32
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], s4
+; GCN-NEXT: s_sub_i32 s4, 32, s4
+; GCN-NEXT: v_min_u32_e32 v0, 1, v0
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s4
+; GCN-NEXT: v_rcp_f32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s4, 0x41c00000
-; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1
@@ -1545,10 +1723,18 @@ define i64 @v_test_udiv24_k_num_i64(i64 %x) {
; GCN-IR-LABEL: v_test_udiv24_k_num_i64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IR-NEXT: s_flbit_i32_b32 s4, 0
; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1
+; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
+; GCN-IR-NEXT: s_min_u32 s4, s4, 32
+; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], s4
+; GCN-IR-NEXT: s_sub_i32 s4, 32, s4
+; GCN-IR-NEXT: v_min_u32_e32 v0, 1, v0
+; GCN-IR-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s4
+; GCN-IR-NEXT: v_rcp_f32_e32 v1, v0
; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000
-; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1
; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1
; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1
@@ -1567,10 +1753,18 @@ define i64 @v_test_udiv24_pow2_k_num_i64(i64 %x) {
; GCN-LABEL: v_test_udiv24_pow2_k_num_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_flbit_i32_b32 s4, 0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_min_u32 s4, s4, 32
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], s4
+; GCN-NEXT: s_sub_i32 s4, 32, s4
+; GCN-NEXT: v_min_u32_e32 v0, 1, v0
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GCN-NEXT: v_ldexp_f32_e64 v0, v0, s4
+; GCN-NEXT: v_rcp_f32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s4, 0x47000000
-; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GCN-NEXT: v_mul_f32_e32 v1, 0x47000000, v1
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1
@@ -1584,10 +1778,18 @@ define i64 @v_test_udiv24_pow2_k_num_i64(i64 %x) {
; GCN-IR-LABEL: v_test_udiv24_pow2_k_num_i64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IR-NEXT: s_flbit_i32_b32 s4, 0
; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1
+; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
+; GCN-IR-NEXT: s_min_u32 s4, s4, 32
+; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], s4
+; GCN-IR-NEXT: s_sub_i32 s4, 32, s4
+; GCN-IR-NEXT: v_min_u32_e32 v0, 1, v0
+; GCN-IR-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s4
+; GCN-IR-NEXT: v_rcp_f32_e32 v1, v0
; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000
-; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x47000000, v1
; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1
; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1
@@ -1613,17 +1815,25 @@ define i64 @v_test_udiv24_pow2_k_den_i64(i64 %x) {
; GCN-IR-LABEL: v_test_udiv24_pow2_k_den_i64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IR-NEXT: s_flbit_i32_b32 s4, 0
; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1
+; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
+; GCN-IR-NEXT: s_min_u32 s4, s4, 32
+; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], s4
+; GCN-IR-NEXT: s_sub_i32 s4, 32, s4
+; GCN-IR-NEXT: v_min_u32_e32 v0, 1, v0
+; GCN-IR-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000
+; GCN-IR-NEXT: v_ldexp_f32_e64 v0, v0, s4
; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38000000, v0
; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1
; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1
+; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000
; GCN-IR-NEXT: v_mad_f32 v0, -v1, s4, v0
; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4
-; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0
+; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
%x.shr = lshr i64 %x, 40
%result = udiv i64 %x.shr, 32768
diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
index eb1b844ad8938..b398a5ac4354d 100644
--- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -215,8 +215,9 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16(
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cvt_f16_u16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f16_u16_e32 v0, v0
+; VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
@@ -238,7 +239,9 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16(
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -260,7 +263,9 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16(
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 6ab3022a91cd7..55f56aece16ea 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -695,7 +695,7 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
; SDAG-VI-NEXT: v_med3_i32 v1, s2, 0, v0
; SDAG-VI-NEXT: v_med3_i32 v0, s3, 0, v0
; SDAG-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SDAG-VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dword v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll
index 801324eec454e..0b70c22245177 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll
@@ -1090,9 +1090,13 @@ define i16 @test_vector_reduce_and_v3i16(<3 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_and_v3i16:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v2
-; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_or_b32_e32 v1, 0xffff0000, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_and_v3i16:
diff --git a/llvm/test/CodeGen/PowerPC/add_cmp.ll b/llvm/test/CodeGen/PowerPC/add_cmp.ll
index cbe16a498a538..47940cfbb46bb 100644
--- a/llvm/test/CodeGen/PowerPC/add_cmp.ll
+++ b/llvm/test/CodeGen/PowerPC/add_cmp.ll
@@ -10,7 +10,7 @@ entry:
; CHECK: === addiCmpiUnsigned
; CHECK: Optimized lowered selection DAG: %bb.0 'addiCmpiUnsigned:entry'
-; CHECK: [[REG1:t[0-9]+]]: i32 = truncate {{t[0-9]+}}
+; CHECK: [[REG1:t[0-9]+]]: i32 = truncate nuw {{t[0-9]+}}
; CHECK: [[REG2:t[0-9]+]]: i32 = add nuw [[REG1]], Constant:i32<10>
; CHECK: {{t[0-9]+}}: i1 = setcc [[REG2]], Constant:i32<100>, setugt:ch
}
@@ -23,7 +23,7 @@ entry:
; CHECK: === addiCmpiSigned
; CHECK: Optimized lowered selection DAG: %bb.0 'addiCmpiSigned:entry'
-; CHECK: [[REG1:t[0-9]+]]: i32 = truncate {{t[0-9]+}}
+; CHECK: [[REG1:t[0-9]+]]: i32 = truncate nsw {{t[0-9]+}}
; CHECK: [[REG2:t[0-9]+]]: i32 = add nsw [[REG1]], Constant:i32<16>
; CHECK: {{t[0-9]+}}: i1 = setcc [[REG2]], Constant:i32<30>, setgt:ch
}
@@ -36,7 +36,7 @@ entry:
; CHECK: === addiCmpiUnsignedOverflow
; CHECK: Optimized lowered selection DAG: %bb.0 'addiCmpiUnsignedOverflow:entry'
-; CHECK: [[REG1:t[0-9]+]]: i32 = truncate {{t[0-9]+}}
+; CHECK: [[REG1:t[0-9]+]]: i32 = truncate nuw {{t[0-9]+}}
; CHECK: [[REG2:t[0-9]+]]: i32 = add nuw [[REG1]], Constant:i32<110>
; CHECK: {{t[0-9]+}}: i1 = setcc [[REG2]], Constant:i32<100>, setugt:ch
}
@@ -49,7 +49,7 @@ entry:
; CHECK: === addiCmpiSignedOverflow
; CHECK: Optimized lowered selection DAG: %bb.0 'addiCmpiSignedOverflow:entry'
-; CHECK: [[REG1:t[0-9]+]]: i16 = truncate {{t[0-9]+}}
+; CHECK: [[REG1:t[0-9]+]]: i16 = truncate nsw {{t[0-9]+}}
; CHECK: [[REG2:t[0-9]+]]: i16 = add nsw [[REG1]], Constant:i16<16>
; CHECK: {{t[0-9]+}}: i1 = setcc [[REG2]], Constant:i16<-32767>, setgt:ch
}
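The add_cmp.ll checks above are where the new wrap flags become directly visible in the optimized DAG dump. For reference, here is a minimal LLVM IR sketch (a hypothetical function, not the actual add_cmp.ll source) of the kind of input that can now print `truncate nuw`: the mask proves the truncated-away bits are zero, so the truncate provably drops no set bits.

define i1 @trunc_nuw_sketch(i64 %x) {
entry:
  ; The high 32 bits are known zero after the mask, so known-bits
  ; analysis can tag the DAG node as "i32 = truncate nuw".
  %masked = and i64 %x, 4294967295
  %t = trunc i64 %masked to i32
  %a = add nuw i32 %t, 10
  %c = icmp ugt i32 %a, 100
  ret i1 %c
}

The nsw case is analogous: when the significant sign bits of the source fit the destination width, the truncate can be tagged nsw, which is what the addiCmpiSigned checks observe.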
diff --git a/llvm/test/CodeGen/PowerPC/f128-bitcast.ll b/llvm/test/CodeGen/PowerPC/f128-bitcast.ll
index 55ba3cb1e0538..2945409c103f2 100644
--- a/llvm/test/CodeGen/PowerPC/f128-bitcast.ll
+++ b/llvm/test/CodeGen/PowerPC/f128-bitcast.ll
@@ -35,17 +35,22 @@ entry:
define i64 @getPart2(fp128 %in) local_unnamed_addr {
; CHECK-LABEL: getPart2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: mfvsrd r3, v2
+; CHECK-NEXT: stxv v2, -16(r1)
+; CHECK-NEXT: ld r3, -8(r1)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: getPart2:
; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-NEXT: mfvsrd r3, v2
+; CHECK-BE-NEXT: stxv v2, -16(r1)
+; CHECK-BE-NEXT: ld r3, -16(r1)
; CHECK-BE-NEXT: blr
;
; CHECK-P8-LABEL: getPart2:
; CHECK-P8: # %bb.0: # %entry
-; CHECK-P8-NEXT: mfvsrd r3, v2
+; CHECK-P8-NEXT: xxswapd vs0, v2
+; CHECK-P8-NEXT: addi r3, r1, -16
+; CHECK-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-P8-NEXT: ld r3, -8(r1)
; CHECK-P8-NEXT: blr
entry:
%0 = bitcast fp128 %in to i128
diff --git a/llvm/test/CodeGen/VE/Scalar/atomic.ll b/llvm/test/CodeGen/VE/Scalar/atomic.ll
index 2fa6b0d7bcc1d..a7065d3c430ff 100644
--- a/llvm/test/CodeGen/VE/Scalar/atomic.ll
+++ b/llvm/test/CodeGen/VE/Scalar/atomic.ll
@@ -83,6 +83,7 @@ define signext i32 @test_atomic_fetch_and_4() {
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: or %s2, 0, %s1
; CHECK-NEXT: and %s1, 1, %s2
+; CHECK-NEXT: and %s1, %s1, (32)0
; CHECK-NEXT: cas.w %s1, (%s0), %s2
; CHECK-NEXT: brne.w %s1, %s2, .LBB2_1
; CHECK-NEXT: # %bb.2: # %atomicrmw.end
diff --git a/llvm/test/CodeGen/VE/Scalar/atomic_cmp_swap.ll b/llvm/test/CodeGen/VE/Scalar/atomic_cmp_swap.ll
index b70f0ea602d0b..69b71e9b688e2 100644
--- a/llvm/test/CodeGen/VE/Scalar/atomic_cmp_swap.ll
+++ b/llvm/test/CodeGen/VE/Scalar/atomic_cmp_swap.ll
@@ -84,6 +84,7 @@ define zeroext i1 @_Z26atomic_cmp_swap_relaxed_i1RNSt3__16atomicIbEERbb(ptr noca
; CHECK-NEXT: ld1b.zx %s4, (, %s1)
; CHECK-NEXT: and %s3, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: ldl.sx %s5, (, %s3)
; CHECK-NEXT: sla.w.sx %s6, (56)0, %s0
@@ -128,10 +129,12 @@ define signext i8 @_Z26atomic_cmp_swap_relaxed_i8RNSt3__16atomicIcEERcc(ptr noca
; CHECK-NEXT: ld1b.zx %s4, (, %s1)
; CHECK-NEXT: and %s3, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: sla.w.sx %s5, (56)0, %s0
-; CHECK-NEXT: ldl.sx %s6, (, %s3)
; CHECK-NEXT: and %s2, %s2, (56)0
+; CHECK-NEXT: ldl.sx %s6, (, %s3)
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s2, %s2, %s0
; CHECK-NEXT: sla.w.sx %s4, %s4, %s0
; CHECK-NEXT: nnd %s5, %s5, %s6
@@ -173,6 +176,7 @@ define zeroext i8 @_Z26atomic_cmp_swap_relaxed_u8RNSt3__16atomicIhEERhh(ptr noca
; CHECK-NEXT: ld1b.zx %s4, (, %s1)
; CHECK-NEXT: and %s3, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: ldl.sx %s5, (, %s3)
; CHECK-NEXT: sla.w.sx %s6, (56)0, %s0
@@ -217,10 +221,12 @@ define signext i16 @_Z27atomic_cmp_swap_relaxed_i16RNSt3__16atomicIsEERss(ptr no
; CHECK-NEXT: ld2b.zx %s4, (, %s1)
; CHECK-NEXT: and %s3, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: sla.w.sx %s5, (48)0, %s0
-; CHECK-NEXT: ldl.sx %s6, (, %s3)
; CHECK-NEXT: and %s2, %s2, (48)0
+; CHECK-NEXT: ldl.sx %s6, (, %s3)
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s2, %s2, %s0
; CHECK-NEXT: sla.w.sx %s4, %s4, %s0
; CHECK-NEXT: nnd %s5, %s5, %s6
@@ -262,6 +268,7 @@ define zeroext i16 @_Z27atomic_cmp_swap_relaxed_u16RNSt3__16atomicItEERtt(ptr no
; CHECK-NEXT: ld2b.zx %s4, (, %s1)
; CHECK-NEXT: and %s3, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: ldl.sx %s5, (, %s3)
; CHECK-NEXT: sla.w.sx %s6, (48)0, %s0
@@ -524,6 +531,7 @@ define zeroext i1 @_Z26atomic_cmp_swap_acquire_i1RNSt3__16atomicIbEERbb(ptr noca
; CHECK-NEXT: ld1b.zx %s4, (, %s1)
; CHECK-NEXT: and %s3, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: ldl.sx %s5, (, %s3)
; CHECK-NEXT: sla.w.sx %s6, (56)0, %s0
@@ -569,10 +577,12 @@ define signext i8 @_Z26atomic_cmp_swap_acquire_i8RNSt3__16atomicIcEERcc(ptr noca
; CHECK-NEXT: ld1b.zx %s4, (, %s1)
; CHECK-NEXT: and %s3, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: sla.w.sx %s5, (56)0, %s0
-; CHECK-NEXT: ldl.sx %s6, (, %s3)
; CHECK-NEXT: and %s2, %s2, (56)0
+; CHECK-NEXT: ldl.sx %s6, (, %s3)
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s2, %s2, %s0
; CHECK-NEXT: sla.w.sx %s4, %s4, %s0
; CHECK-NEXT: nnd %s5, %s5, %s6
@@ -615,6 +625,7 @@ define zeroext i8 @_Z26atomic_cmp_swap_acquire_u8RNSt3__16atomicIhEERhh(ptr noca
; CHECK-NEXT: ld1b.zx %s4, (, %s1)
; CHECK-NEXT: and %s3, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: ldl.sx %s5, (, %s3)
; CHECK-NEXT: sla.w.sx %s6, (56)0, %s0
@@ -660,10 +671,12 @@ define signext i16 @_Z27atomic_cmp_swap_acquire_i16RNSt3__16atomicIsEERss(ptr no
; CHECK-NEXT: ld2b.zx %s4, (, %s1)
; CHECK-NEXT: and %s3, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: sla.w.sx %s5, (48)0, %s0
-; CHECK-NEXT: ldl.sx %s6, (, %s3)
; CHECK-NEXT: and %s2, %s2, (48)0
+; CHECK-NEXT: ldl.sx %s6, (, %s3)
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s2, %s2, %s0
; CHECK-NEXT: sla.w.sx %s4, %s4, %s0
; CHECK-NEXT: nnd %s5, %s5, %s6
@@ -706,6 +719,7 @@ define zeroext i16 @_Z27atomic_cmp_swap_acquire_u16RNSt3__16atomicItEERtt(ptr no
; CHECK-NEXT: ld2b.zx %s4, (, %s1)
; CHECK-NEXT: and %s3, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: ldl.sx %s5, (, %s3)
; CHECK-NEXT: sla.w.sx %s6, (48)0, %s0
@@ -974,6 +988,7 @@ define zeroext i1 @_Z26atomic_cmp_swap_seq_cst_i1RNSt3__16atomicIbEERbb(ptr noca
; CHECK-NEXT: fencem 3
; CHECK-NEXT: and %s3, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: ldl.sx %s5, (, %s3)
; CHECK-NEXT: sla.w.sx %s6, (56)0, %s0
@@ -1020,10 +1035,12 @@ define signext i8 @_Z26atomic_cmp_swap_seq_cst_i8RNSt3__16atomicIcEERcc(ptr noca
; CHECK-NEXT: fencem 3
; CHECK-NEXT: and %s3, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: sla.w.sx %s5, (56)0, %s0
-; CHECK-NEXT: ldl.sx %s6, (, %s3)
; CHECK-NEXT: and %s2, %s2, (56)0
+; CHECK-NEXT: ldl.sx %s6, (, %s3)
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s2, %s2, %s0
; CHECK-NEXT: sla.w.sx %s4, %s4, %s0
; CHECK-NEXT: nnd %s5, %s5, %s6
@@ -1067,6 +1084,7 @@ define zeroext i8 @_Z26atomic_cmp_swap_seq_cst_u8RNSt3__16atomicIhEERhh(ptr noca
; CHECK-NEXT: fencem 3
; CHECK-NEXT: and %s3, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: ldl.sx %s5, (, %s3)
; CHECK-NEXT: sla.w.sx %s6, (56)0, %s0
@@ -1113,10 +1131,12 @@ define signext i16 @_Z27atomic_cmp_swap_seq_cst_i16RNSt3__16atomicIsEERss(ptr no
; CHECK-NEXT: fencem 3
; CHECK-NEXT: and %s3, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: sla.w.sx %s5, (48)0, %s0
-; CHECK-NEXT: ldl.sx %s6, (, %s3)
; CHECK-NEXT: and %s2, %s2, (48)0
+; CHECK-NEXT: ldl.sx %s6, (, %s3)
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s2, %s2, %s0
; CHECK-NEXT: sla.w.sx %s4, %s4, %s0
; CHECK-NEXT: nnd %s5, %s5, %s6
@@ -1160,6 +1180,7 @@ define zeroext i16 @_Z27atomic_cmp_swap_seq_cst_u16RNSt3__16atomicItEERtt(ptr no
; CHECK-NEXT: fencem 3
; CHECK-NEXT: and %s3, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: ldl.sx %s5, (, %s3)
; CHECK-NEXT: sla.w.sx %s6, (48)0, %s0
diff --git a/llvm/test/CodeGen/VE/Scalar/atomic_swap.ll b/llvm/test/CodeGen/VE/Scalar/atomic_swap.ll
index 23bb7d6efeacd..c773905b0f972 100644
--- a/llvm/test/CodeGen/VE/Scalar/atomic_swap.ll
+++ b/llvm/test/CodeGen/VE/Scalar/atomic_swap.ll
@@ -82,6 +82,7 @@ define zeroext i1 @_Z22atomic_swap_relaxed_i1RNSt3__16atomicIbEEb(ptr nocapture
; CHECK-LABEL: _Z22atomic_swap_relaxed_i1RNSt3__16atomicIbEEb:
; CHECK: # %bb.0:
; CHECK-NEXT: and %s2, 3, %s0
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s1, %s1, %s3
; CHECK-NEXT: and %s0, -4, %s0
@@ -103,6 +104,7 @@ define signext i8 @_Z22atomic_swap_relaxed_i8RNSt3__16atomicIcEEc(ptr nocapture
; CHECK-LABEL: _Z22atomic_swap_relaxed_i8RNSt3__16atomicIcEEc:
; CHECK: # %bb.0:
; CHECK-NEXT: and %s2, 3, %s0
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s1, %s1, %s3
; CHECK-NEXT: and %s0, -4, %s0
@@ -122,6 +124,7 @@ define zeroext i8 @_Z22atomic_swap_relaxed_u8RNSt3__16atomicIhEEh(ptr nocapture
; CHECK-LABEL: _Z22atomic_swap_relaxed_u8RNSt3__16atomicIhEEh:
; CHECK: # %bb.0:
; CHECK-NEXT: and %s2, 3, %s0
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s1, %s1, %s3
; CHECK-NEXT: and %s0, -4, %s0
@@ -140,6 +143,7 @@ define signext i16 @_Z23atomic_swap_relaxed_i16RNSt3__16atomicIsEEs(ptr nocaptur
; CHECK-LABEL: _Z23atomic_swap_relaxed_i16RNSt3__16atomicIsEEs:
; CHECK: # %bb.0:
; CHECK-NEXT: and %s2, 3, %s0
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s1, %s1, %s3
; CHECK-NEXT: and %s0, -4, %s0
@@ -159,6 +163,7 @@ define zeroext i16 @_Z23atomic_swap_relaxed_u16RNSt3__16atomicItEEt(ptr nocaptur
; CHECK-LABEL: _Z23atomic_swap_relaxed_u16RNSt3__16atomicItEEt:
; CHECK: # %bb.0:
; CHECK-NEXT: and %s2, 3, %s0
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s1, %s1, %s3
; CHECK-NEXT: and %s0, -4, %s0
@@ -321,6 +326,7 @@ define zeroext i1 @_Z22atomic_swap_acquire_i1RNSt3__16atomicIbEEb(ptr nocapture
; CHECK-LABEL: _Z22atomic_swap_acquire_i1RNSt3__16atomicIbEEb:
; CHECK: # %bb.0:
; CHECK-NEXT: and %s2, 3, %s0
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s1, %s1, %s3
; CHECK-NEXT: and %s0, -4, %s0
@@ -343,6 +349,7 @@ define signext i8 @_Z22atomic_swap_acquire_i8RNSt3__16atomicIcEEc(ptr nocapture
; CHECK-LABEL: _Z22atomic_swap_acquire_i8RNSt3__16atomicIcEEc:
; CHECK: # %bb.0:
; CHECK-NEXT: and %s2, 3, %s0
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s1, %s1, %s3
; CHECK-NEXT: and %s0, -4, %s0
@@ -363,6 +370,7 @@ define zeroext i8 @_Z22atomic_swap_acquire_u8RNSt3__16atomicIhEEh(ptr nocapture
; CHECK-LABEL: _Z22atomic_swap_acquire_u8RNSt3__16atomicIhEEh:
; CHECK: # %bb.0:
; CHECK-NEXT: and %s2, 3, %s0
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s1, %s1, %s3
; CHECK-NEXT: and %s0, -4, %s0
@@ -382,6 +390,7 @@ define signext i16 @_Z23atomic_swap_acquire_i16RNSt3__16atomicIsEEs(ptr nocaptur
; CHECK-LABEL: _Z23atomic_swap_acquire_i16RNSt3__16atomicIsEEs:
; CHECK: # %bb.0:
; CHECK-NEXT: and %s2, 3, %s0
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s1, %s1, %s3
; CHECK-NEXT: and %s0, -4, %s0
@@ -402,6 +411,7 @@ define zeroext i16 @_Z23atomic_swap_acquire_u16RNSt3__16atomicItEEt(ptr nocaptur
; CHECK-LABEL: _Z23atomic_swap_acquire_u16RNSt3__16atomicItEEt:
; CHECK: # %bb.0:
; CHECK-NEXT: and %s2, 3, %s0
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s1, %s1, %s3
; CHECK-NEXT: and %s0, -4, %s0
@@ -570,6 +580,7 @@ define zeroext i1 @_Z22atomic_swap_seq_cst_i1RNSt3__16atomicIbEEb(ptr nocapture
; CHECK: # %bb.0:
; CHECK-NEXT: fencem 3
; CHECK-NEXT: and %s2, 3, %s0
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s1, %s1, %s3
; CHECK-NEXT: and %s0, -4, %s0
@@ -593,6 +604,7 @@ define signext i8 @_Z22atomic_swap_seq_cst_i8RNSt3__16atomicIcEEc(ptr nocapture
; CHECK: # %bb.0:
; CHECK-NEXT: fencem 3
; CHECK-NEXT: and %s2, 3, %s0
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s1, %s1, %s3
; CHECK-NEXT: and %s0, -4, %s0
@@ -614,6 +626,7 @@ define zeroext i8 @_Z22atomic_swap_seq_cst_u8RNSt3__16atomicIhEEh(ptr nocapture
; CHECK: # %bb.0:
; CHECK-NEXT: fencem 3
; CHECK-NEXT: and %s2, 3, %s0
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s1, %s1, %s3
; CHECK-NEXT: and %s0, -4, %s0
@@ -634,6 +647,7 @@ define signext i16 @_Z23atomic_swap_seq_cst_i16RNSt3__16atomicIsEEs(ptr nocaptur
; CHECK: # %bb.0:
; CHECK-NEXT: fencem 3
; CHECK-NEXT: and %s2, 3, %s0
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s1, %s1, %s3
; CHECK-NEXT: and %s0, -4, %s0
@@ -655,6 +669,7 @@ define zeroext i16 @_Z23atomic_swap_seq_cst_u16RNSt3__16atomicItEEt(ptr nocaptur
; CHECK: # %bb.0:
; CHECK-NEXT: fencem 3
; CHECK-NEXT: and %s2, 3, %s0
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s1, %s1, %s3
; CHECK-NEXT: and %s0, -4, %s0
@@ -1200,6 +1215,7 @@ define zeroext i1 @_Z25atomic_swap_relaxed_gv_i1b(i1 zeroext %0) {
; CHECK-NEXT: and %s1, %s1, (32)0
; CHECK-NEXT: lea.sl %s1, gv_i1@hi(, %s1)
; CHECK-NEXT: and %s2, 3, %s1
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s0, %s0, %s3
; CHECK-NEXT: and %s1, -4, %s1
@@ -1224,6 +1240,7 @@ define signext i8 @_Z25atomic_swap_relaxed_gv_i8c(i8 signext %0) {
; CHECK-NEXT: and %s1, %s1, (32)0
; CHECK-NEXT: lea.sl %s1, gv_i8@hi(, %s1)
; CHECK-NEXT: and %s2, 3, %s1
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s0, %s0, %s3
; CHECK-NEXT: and %s1, -4, %s1
@@ -1246,6 +1263,7 @@ define zeroext i8 @_Z25atomic_swap_relaxed_gv_u8h(i8 zeroext %0) {
; CHECK-NEXT: and %s1, %s1, (32)0
; CHECK-NEXT: lea.sl %s1, gv_u8@hi(, %s1)
; CHECK-NEXT: and %s2, 3, %s1
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s0, %s0, %s3
; CHECK-NEXT: and %s1, -4, %s1
@@ -1267,6 +1285,7 @@ define signext i16 @_Z26atomic_swap_relaxed_gv_i16s(i16 signext %0) {
; CHECK-NEXT: and %s1, %s1, (32)0
; CHECK-NEXT: lea.sl %s1, gv_i16@hi(, %s1)
; CHECK-NEXT: and %s2, 3, %s1
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s0, %s0, %s3
; CHECK-NEXT: and %s1, -4, %s1
@@ -1289,6 +1308,7 @@ define zeroext i16 @_Z26atomic_swap_relaxed_gv_u16t(i16 zeroext %0) {
; CHECK-NEXT: and %s1, %s1, (32)0
; CHECK-NEXT: lea.sl %s1, gv_u16@hi(, %s1)
; CHECK-NEXT: and %s2, 3, %s1
+; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: sla.w.sx %s3, %s2, 3
; CHECK-NEXT: sla.w.sx %s0, %s0, %s3
; CHECK-NEXT: and %s1, -4, %s1
diff --git a/llvm/test/CodeGen/VE/Scalar/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/VE/Scalar/atomicrmw-cond-sub-clamp.ll
index 860c4004658db..1c76c51a34719 100644
--- a/llvm/test/CodeGen/VE/Scalar/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/VE/Scalar/atomicrmw-cond-sub-clamp.ll
@@ -8,6 +8,7 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) {
; CHECK-NEXT: fencem 3
; CHECK-NEXT: and %s2, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: sla.w.sx %s3, (56)0, %s0
; CHECK-NEXT: ldl.sx %s5, (, %s2)
@@ -24,6 +25,7 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) {
; CHECK-NEXT: cmpu.w %s7, %s7, %s4
; CHECK-NEXT: cmov.w.ge %s5, %s34, %s7
; CHECK-NEXT: and %s5, %s5, (56)0
+; CHECK-NEXT: and %s5, %s5, (32)0
; CHECK-NEXT: sla.w.sx %s5, %s5, %s0
; CHECK-NEXT: and %s7, %s6, %s3
; CHECK-NEXT: or %s5, %s7, %s5
@@ -45,6 +47,7 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) {
; CHECK-NEXT: fencem 3
; CHECK-NEXT: and %s2, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: sla.w.sx %s3, (48)0, %s0
; CHECK-NEXT: ldl.sx %s5, (, %s2)
@@ -61,6 +64,7 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) {
; CHECK-NEXT: cmpu.w %s7, %s7, %s4
; CHECK-NEXT: cmov.w.ge %s5, %s34, %s7
; CHECK-NEXT: and %s5, %s5, (48)0
+; CHECK-NEXT: and %s5, %s5, (32)0
; CHECK-NEXT: sla.w.sx %s5, %s5, %s0
; CHECK-NEXT: and %s7, %s6, %s3
; CHECK-NEXT: or %s5, %s7, %s5
@@ -126,12 +130,14 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
; CHECK-NEXT: fencem 3
; CHECK-NEXT: and %s1, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: sla.w.sx %s2, (56)0, %s0
-; CHECK-NEXT: ldl.sx %s4, (, %s1)
; CHECK-NEXT: xor %s2, -1, %s2
+; CHECK-NEXT: ldl.sx %s4, (, %s1)
; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: and %s3, %s3, (56)0
+; CHECK-NEXT: and %s3, %s3, (32)0
; CHECK-NEXT: .LBB4_1: # %atomicrmw.start
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: or %s5, 0, %s4
@@ -162,12 +168,14 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
; CHECK-NEXT: fencem 3
; CHECK-NEXT: and %s1, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: sla.w.sx %s2, (48)0, %s0
-; CHECK-NEXT: ldl.sx %s4, (, %s1)
; CHECK-NEXT: xor %s2, -1, %s2
+; CHECK-NEXT: ldl.sx %s4, (, %s1)
; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: and %s3, %s3, (48)0
+; CHECK-NEXT: and %s3, %s3, (32)0
; CHECK-NEXT: .LBB5_1: # %atomicrmw.start
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: or %s5, 0, %s4
diff --git a/llvm/test/CodeGen/VE/Scalar/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/VE/Scalar/atomicrmw-uinc-udec-wrap.ll
index 3cb1a4e1e477d..50e234119270a 100644
--- a/llvm/test/CodeGen/VE/Scalar/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/VE/Scalar/atomicrmw-uinc-udec-wrap.ll
@@ -8,6 +8,7 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
; CHECK-NEXT: fencem 3
; CHECK-NEXT: and %s1, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: sla.w.sx %s2, (56)0, %s0
; CHECK-NEXT: ldl.sx %s4, (, %s1)
@@ -24,6 +25,7 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
; CHECK-NEXT: cmpu.w %s6, %s6, %s3
; CHECK-NEXT: cmov.w.ge %s4, (0)1, %s6
; CHECK-NEXT: and %s4, %s4, (56)0
+; CHECK-NEXT: and %s4, %s4, (32)0
; CHECK-NEXT: sla.w.sx %s4, %s4, %s0
; CHECK-NEXT: and %s6, %s5, %s2
; CHECK-NEXT: or %s4, %s6, %s4
@@ -45,6 +47,7 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
; CHECK-NEXT: fencem 3
; CHECK-NEXT: and %s1, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: sla.w.sx %s2, (48)0, %s0
; CHECK-NEXT: ldl.sx %s4, (, %s1)
@@ -61,6 +64,7 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
; CHECK-NEXT: cmpu.w %s6, %s6, %s3
; CHECK-NEXT: cmov.w.ge %s4, (0)1, %s6
; CHECK-NEXT: and %s4, %s4, (48)0
+; CHECK-NEXT: and %s4, %s4, (32)0
; CHECK-NEXT: sla.w.sx %s4, %s4, %s0
; CHECK-NEXT: and %s6, %s5, %s2
; CHECK-NEXT: or %s4, %s6, %s4
@@ -125,6 +129,7 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
; CHECK-NEXT: fencem 3
; CHECK-NEXT: and %s2, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: sla.w.sx %s3, (56)0, %s0
; CHECK-NEXT: ldl.sx %s5, (, %s2)
@@ -142,6 +147,7 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
; CHECK-NEXT: cmov.w.gt %s5, %s1, %s34
; CHECK-NEXT: cmov.w.eq %s5, %s1, %s7
; CHECK-NEXT: and %s5, %s5, (56)0
+; CHECK-NEXT: and %s5, %s5, (32)0
; CHECK-NEXT: sla.w.sx %s5, %s5, %s0
; CHECK-NEXT: and %s7, %s6, %s3
; CHECK-NEXT: or %s5, %s7, %s5
@@ -163,6 +169,7 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
; CHECK-NEXT: fencem 3
; CHECK-NEXT: and %s2, -4, %s0
; CHECK-NEXT: and %s0, 3, %s0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sla.w.sx %s0, %s0, 3
; CHECK-NEXT: sla.w.sx %s3, (48)0, %s0
; CHECK-NEXT: ldl.sx %s5, (, %s2)
@@ -180,6 +187,7 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
; CHECK-NEXT: cmov.w.gt %s5, %s1, %s34
; CHECK-NEXT: cmov.w.eq %s5, %s1, %s7
; CHECK-NEXT: and %s5, %s5, (48)0
+; CHECK-NEXT: and %s5, %s5, (32)0
; CHECK-NEXT: sla.w.sx %s5, %s5, %s0
; CHECK-NEXT: and %s7, %s6, %s3
; CHECK-NEXT: or %s5, %s7, %s5
diff --git a/llvm/test/CodeGen/VE/Scalar/fabs.ll b/llvm/test/CodeGen/VE/Scalar/fabs.ll
index a68e561d0098f..cf0bfb1869df5 100644
--- a/llvm/test/CodeGen/VE/Scalar/fabs.ll
+++ b/llvm/test/CodeGen/VE/Scalar/fabs.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=ve | FileCheck %s
;;; Test ‘llvm.fabs.*’ Intrinsic
@@ -33,6 +34,7 @@ define float @fabs_float_var(float %0) {
; CHECK: # %bb.0:
; CHECK-NEXT: sra.l %s0, %s0, 32
; CHECK-NEXT: and %s0, %s0, (33)0
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sll %s0, %s0, 32
; CHECK-NEXT: b.l.t (, %s10)
%2 = tail call fast float @llvm.fabs.f32(float %0)
@@ -58,7 +60,19 @@ declare double @llvm.fabs.f64(double)
; Function Attrs: nounwind readnone
define fp128 @fabs_quad_var(fp128 %0) {
; CHECK-LABEL: fabs_quad_var:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: adds.l %s11, -16, %s11
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB2_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB2_2:
; CHECK-NEXT: st %s1, (, %s11)
; CHECK-NEXT: st %s0, 8(, %s11)
; CHECK-NEXT: ld1b.zx %s0, 15(, %s11)
@@ -97,9 +111,9 @@ define double @fabs_double_zero() {
define fp128 @fabs_quad_zero() {
; CHECK-LABEL: fabs_quad_zero:
; CHECK: # %bb.0:
-; CHECK-NEXT: lea %s0, .LCPI{{[0-9]+}}_0@lo
+; CHECK-NEXT: lea %s0, .LCPI5_0@lo
; CHECK-NEXT: and %s0, %s0, (32)0
-; CHECK-NEXT: lea.sl %s2, .LCPI{{[0-9]+}}_0@hi(, %s0)
+; CHECK-NEXT: lea.sl %s2, .LCPI5_0@hi(, %s0)
; CHECK-NEXT: ld %s0, 8(, %s2)
; CHECK-NEXT: ld %s1, (, %s2)
; CHECK-NEXT: b.l.t (, %s10)
@@ -128,9 +142,9 @@ define double @fabs_double_const() {
define fp128 @fabs_quad_const() {
; CHECK-LABEL: fabs_quad_const:
; CHECK: # %bb.0:
-; CHECK-NEXT: lea %s0, .LCPI{{[0-9]+}}_0@lo
+; CHECK-NEXT: lea %s0, .LCPI8_0@lo
; CHECK-NEXT: and %s0, %s0, (32)0
-; CHECK-NEXT: lea.sl %s2, .LCPI{{[0-9]+}}_0@hi(, %s0)
+; CHECK-NEXT: lea.sl %s2, .LCPI8_0@hi(, %s0)
; CHECK-NEXT: ld %s0, 8(, %s2)
; CHECK-NEXT: ld %s1, (, %s2)
; CHECK-NEXT: b.l.t (, %s10)
diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
index 1a2cfd69650b8..61b1b060c5899 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -64,7 +64,9 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) nounwind {
; AVX512-LABEL: stest_f64i32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttpd2qq %xmm0, %xmm0
-; AVX512-NEXT: vpmovsqd %xmm0, %xmm0
+; AVX512-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512-NEXT: retq
entry:
%conv = fptosi <2 x double> %x to <2 x i64>
@@ -143,7 +145,8 @@ define <2 x i32> @utest_f64i32(<2 x double> %x) nounwind {
; AVX512-LABEL: utest_f64i32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttpd2uqq %xmm0, %xmm0
-; AVX512-NEXT: vpmovusqd %xmm0, %xmm0
+; AVX512-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512-NEXT: retq
entry:
%conv = fptoui <2 x double> %x to <2 x i64>
@@ -210,9 +213,10 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) nounwind {
; AVX512-LABEL: ustest_f64i32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttpd2qq %xmm0, %xmm0
+; AVX512-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpmovusqd %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512-NEXT: retq
entry:
%conv = fptosi <2 x double> %x to <2 x i64>
@@ -332,7 +336,9 @@ define <4 x i32> @stest_f32i32(<4 x float> %x) nounwind {
; AVX512-LABEL: stest_f32i32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttps2qq %xmm0, %ymm0
-; AVX512-NEXT: vpmovsqd %ymm0, %xmm0
+; AVX512-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %ymm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
@@ -473,7 +479,8 @@ define <4 x i32> @utest_f32i32(<4 x float> %x) nounwind {
; AVX512-LABEL: utest_f32i32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttps2uqq %xmm0, %ymm0
-; AVX512-NEXT: vpmovusqd %ymm0, %xmm0
+; AVX512-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %ymm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
@@ -586,9 +593,10 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) nounwind {
; AVX512-LABEL: ustest_f32i32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttps2qq %xmm0, %ymm0
+; AVX512-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpmovusqd %ymm0, %xmm0
+; AVX512-NEXT: vpmovqd %ymm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
@@ -1056,14 +1064,38 @@ define <2 x i16> @stest_f64i16(<2 x double> %x) nounwind {
; SSE-LABEL: stest_f64i16:
; SSE: # %bb.0: # %entry
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: packssdw %xmm0, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,u,u]
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm0 = [4294934528,4294934528,u,u]
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: pandn %xmm0, %xmm1
+; SSE-NEXT: por %xmm2, %xmm1
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: retq
;
-; AVX-LABEL: stest_f64i16:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: stest_f64i16:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: stest_f64i16:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpmovdw %xmm0, %xmm0
+; AVX512-NEXT: retq
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
%0 = icmp slt <2 x i32> %conv,
@@ -1138,11 +1170,24 @@ define <2 x i16> @ustest_f64i16(<2 x double> %x) nounwind {
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: retq
;
-; AVX-LABEL: ustest_f64i16:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: ustest_f64i16:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ustest_f64i16:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovdw %xmm0, %xmm0
+; AVX512-NEXT: retq
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
%0 = icmp slt <2 x i32> %conv,
@@ -1157,14 +1202,38 @@ define <4 x i16> @stest_f32i16(<4 x float> %x) nounwind {
; SSE-LABEL: stest_f32i16:
; SSE: # %bb.0: # %entry
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: packssdw %xmm0, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: stest_f32i16:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: stest_f32i16:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: stest_f32i16:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: retq
entry:
%conv = fptosi <4 x float> %x to <4 x i32>
%0 = icmp slt <4 x i32> %conv,
@@ -1214,7 +1283,8 @@ define <4 x i16> @utest_f32i16(<4 x float> %x) nounwind {
; AVX512-LABEL: utest_f32i16:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttps2udq %xmm0, %xmm0
-; AVX512-NEXT: vpmovusdw %xmm0, %xmm0
+; AVX512-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
entry:
%conv = fptoui <4 x float> %x to <4 x i32>
@@ -1243,11 +1313,24 @@ define <4 x i16> @ustest_f32i16(<4 x float> %x) nounwind {
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: retq
;
-; AVX-LABEL: ustest_f32i16:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: ustest_f32i16:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ustest_f32i16:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: retq
entry:
%conv = fptosi <4 x float> %x to <4 x i32>
%0 = icmp slt <4 x i32> %conv,
@@ -1308,7 +1391,30 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) nounwind {
; SSE-NEXT: cvttps2dq %xmm1, %xmm0
; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = xmm0[0],mem[0]
-; SSE-NEXT: packssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: pcmpgtd %xmm3, %xmm0
+; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: por %xmm3, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm3
+; SSE-NEXT: por %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: packssdw %xmm3, %xmm0
; SSE-NEXT: addq $72, %rsp
; SSE-NEXT: retq
;
@@ -1316,6 +1422,10 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) nounwind {
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vcvtph2ps %xmm0, %ymm0
; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528]
+; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
@@ -1549,6 +1659,10 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) nounwind {
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vcvtph2ps %xmm0, %ymm0
; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
@@ -1579,16 +1693,40 @@ define <2 x i8> @stest_f64i8(<2 x double> %x) nounwind {
; SSE-LABEL: stest_f64i8:
; SSE: # %bb.0: # %entry
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [127,127,u,u]
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,u,u]
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: packssdw %xmm0, %xmm0
; SSE-NEXT: packsswb %xmm0, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: stest_f64i8:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: stest_f64i8:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
+; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
+; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: stest_f64i8:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512-NEXT: retq
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
%0 = icmp slt <2 x i32> %conv,
@@ -1651,16 +1789,39 @@ define <2 x i8> @ustest_f64i8(<2 x double> %x) nounwind {
; SSE-LABEL: ustest_f64i8:
; SSE: # %bb.0: # %entry
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: packssdw %xmm0, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,u,u]
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: ustest_f64i8:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: ustest_f64i8:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
+; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ustest_f64i8:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512-NEXT: retq
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
%0 = icmp slt <2 x i32> %conv,
@@ -1675,16 +1836,39 @@ define <4 x i8> @stest_f32i8(<4 x float> %x) nounwind {
; SSE-LABEL: stest_f32i8:
; SSE: # %bb.0: # %entry
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127]
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: packssdw %xmm0, %xmm0
; SSE-NEXT: packsswb %xmm0, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: stest_f32i8:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: stest_f32i8:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
+; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
+; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: stest_f32i8:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512-NEXT: retq
entry:
%conv = fptosi <4 x float> %x to <4 x i32>
%0 = icmp slt <4 x i32> %conv,
@@ -1733,7 +1917,8 @@ define <4 x i8> @utest_f32i8(<4 x float> %x) nounwind {
; AVX512-LABEL: utest_f32i8:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttps2udq %xmm0, %xmm0
-; AVX512-NEXT: vpmovusdb %xmm0, %xmm0
+; AVX512-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpmovdb %xmm0, %xmm0
; AVX512-NEXT: retq
entry:
%conv = fptoui <4 x float> %x to <4 x i32>
@@ -1747,16 +1932,38 @@ define <4 x i8> @ustest_f32i8(<4 x float> %x) nounwind {
; SSE-LABEL: ustest_f32i8:
; SSE: # %bb.0: # %entry
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: packssdw %xmm0, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: ustest_f32i8:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: ustest_f32i8:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
+; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ustest_f32i8:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512-NEXT: retq
entry:
%conv = fptosi <4 x float> %x to <4 x i32>
%0 = icmp slt <4 x i32> %conv,
@@ -2688,7 +2895,9 @@ define <2 x i32> @stest_f64i32_mm(<2 x double> %x) nounwind {
; AVX512-LABEL: stest_f64i32_mm:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttpd2qq %xmm0, %xmm0
-; AVX512-NEXT: vpmovsqd %xmm0, %xmm0
+; AVX512-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512-NEXT: retq
entry:
%conv = fptosi <2 x double> %x to <2 x i64>
@@ -2765,7 +2974,8 @@ define <2 x i32> @utest_f64i32_mm(<2 x double> %x) nounwind {
; AVX512-LABEL: utest_f64i32_mm:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttpd2uqq %xmm0, %xmm0
-; AVX512-NEXT: vpmovusqd %xmm0, %xmm0
+; AVX512-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512-NEXT: retq
entry:
%conv = fptoui <2 x double> %x to <2 x i64>
@@ -2831,9 +3041,10 @@ define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) nounwind {
; AVX512-LABEL: ustest_f64i32_mm:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttpd2qq %xmm0, %xmm0
+; AVX512-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpmovusqd %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512-NEXT: retq
entry:
%conv = fptosi <2 x double> %x to <2 x i64>
@@ -2951,7 +3162,9 @@ define <4 x i32> @stest_f32i32_mm(<4 x float> %x) nounwind {
; AVX512-LABEL: stest_f32i32_mm:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttps2qq %xmm0, %ymm0
-; AVX512-NEXT: vpmovsqd %ymm0, %xmm0
+; AVX512-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %ymm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
@@ -3090,7 +3303,8 @@ define <4 x i32> @utest_f32i32_mm(<4 x float> %x) nounwind {
; AVX512-LABEL: utest_f32i32_mm:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttps2uqq %xmm0, %ymm0
-; AVX512-NEXT: vpmovusqd %ymm0, %xmm0
+; AVX512-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %ymm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
@@ -3202,9 +3416,10 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) nounwind {
; AVX512-LABEL: ustest_f32i32_mm:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttps2qq %xmm0, %ymm0
+; AVX512-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpmovusqd %ymm0, %xmm0
+; AVX512-NEXT: vpmovqd %ymm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
@@ -3665,14 +3880,38 @@ define <2 x i16> @stest_f64i16_mm(<2 x double> %x) nounwind {
; SSE-LABEL: stest_f64i16_mm:
; SSE: # %bb.0: # %entry
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: packssdw %xmm0, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,u,u]
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm0 = [4294934528,4294934528,u,u]
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: pandn %xmm0, %xmm1
+; SSE-NEXT: por %xmm2, %xmm1
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: retq
;
-; AVX-LABEL: stest_f64i16_mm:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: stest_f64i16_mm:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: stest_f64i16_mm:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpmovdw %xmm0, %xmm0
+; AVX512-NEXT: retq
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
%spec.store.select = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %conv, <2 x i32> )
@@ -3744,11 +3983,24 @@ define <2 x i16> @ustest_f64i16_mm(<2 x double> %x) nounwind {
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: retq
;
-; AVX-LABEL: ustest_f64i16_mm:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: ustest_f64i16_mm:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ustest_f64i16_mm:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovdw %xmm0, %xmm0
+; AVX512-NEXT: retq
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
%spec.store.select = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %conv, <2 x i32> )
@@ -3761,14 +4013,38 @@ define <4 x i16> @stest_f32i16_mm(<4 x float> %x) nounwind {
; SSE-LABEL: stest_f32i16_mm:
; SSE: # %bb.0: # %entry
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: packssdw %xmm0, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: stest_f32i16_mm:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: stest_f32i16_mm:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: stest_f32i16_mm:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: retq
entry:
%conv = fptosi <4 x float> %x to <4 x i32>
%spec.store.select = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %conv, <4 x i32> )
@@ -3816,7 +4092,8 @@ define <4 x i16> @utest_f32i16_mm(<4 x float> %x) nounwind {
; AVX512-LABEL: utest_f32i16_mm:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttps2udq %xmm0, %xmm0
-; AVX512-NEXT: vpmovusdw %xmm0, %xmm0
+; AVX512-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
entry:
%conv = fptoui <4 x float> %x to <4 x i32>
@@ -3844,11 +4121,24 @@ define <4 x i16> @ustest_f32i16_mm(<4 x float> %x) nounwind {
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: retq
;
-; AVX-LABEL: ustest_f32i16_mm:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: ustest_f32i16_mm:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ustest_f32i16_mm:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: retq
entry:
%conv = fptosi <4 x float> %x to <4 x i32>
%spec.store.select = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %conv, <4 x i32> )
@@ -3907,7 +4197,30 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) nounwind {
; SSE-NEXT: cvttps2dq %xmm1, %xmm0
; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = xmm0[0],mem[0]
-; SSE-NEXT: packssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: pcmpgtd %xmm3, %xmm0
+; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: por %xmm3, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm3
+; SSE-NEXT: por %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: packssdw %xmm3, %xmm0
; SSE-NEXT: addq $72, %rsp
; SSE-NEXT: retq
;
@@ -3915,6 +4228,10 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) nounwind {
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vcvtph2ps %xmm0, %ymm0
; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528]
+; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
@@ -4145,6 +4462,10 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) nounwind {
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vcvtph2ps %xmm0, %ymm0
; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
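For orientation before the madd.ll changes: the stest/utest/ustest patterns exercised above are a convert, a clamp to the narrow type's range, and a truncation of the result. A minimal scalar sketch of the signed-saturation shape (a hypothetical function, simplified from the vector tests in this file) is:

define i16 @stest_scalar_sketch(double %x) {
entry:
  ; Clamp the converted value into [-32768, 32767] and then truncate.
  ; After the clamp the value provably fits in 16 bits, which is what
  ; permits the saturating pack/min/max sequences in the checked output.
  %conv = fptosi double %x to i32
  %lo = call i32 @llvm.smin.i32(i32 %conv, i32 32767)
  %hi = call i32 @llvm.smax.i32(i32 %lo, i32 -32768)
  %t = trunc i32 %hi to i16
  ret i16 %t
}

declare i32 @llvm.smin.i32(i32, i32)
declare i32 @llvm.smax.i32(i32, i32)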
[65535,65535,65535,65535,65535,65535,65535,65535] +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll index bdb7c307a5759..96644f7b446d5 100644 --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -2904,12 +2904,20 @@ define i32 @sum_of_square_differences(ptr %a, ptr %b, i32 %n) { ; SSE2-NEXT: .LBB34_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: psubw %xmm2, %xmm3 -; SSE2-NEXT: pmaddwd %xmm3, %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: psubd %xmm3, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: psubd %xmm2, %xmm4 +; SSE2-NEXT: packssdw %xmm4, %xmm5 +; SSE2-NEXT: pmaddwd %xmm5, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm1 ; SSE2-NEXT: addq $8, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB34_1 @@ -2930,9 +2938,13 @@ define i32 @sum_of_square_differences(ptr %a, ptr %b, i32 %n) { ; AVX1-NEXT: .p2align 4 ; AVX1-NEXT: .LBB34_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmaddwd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2950,32 +2962,62 @@ define i32 @sum_of_square_differences(ptr %a, ptr %b, i32 %n) { ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq 
; -; AVX256-LABEL: sum_of_square_differences: -; AVX256: # %bb.0: # %entry -; AVX256-NEXT: movl %edx, %eax -; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX256-NEXT: xorl %ecx, %ecx -; AVX256-NEXT: .p2align 4 -; AVX256-NEXT: .LBB34_1: # %vector.body -; AVX256-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX256-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX256-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX256-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; AVX256-NEXT: vpmaddwd %xmm1, %xmm1, %xmm1 -; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; AVX256-NEXT: addq $8, %rcx -; AVX256-NEXT: cmpq %rcx, %rax -; AVX256-NEXT: jne .LBB34_1 -; AVX256-NEXT: # %bb.2: # %middle.block -; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX256-NEXT: vmovd %xmm0, %eax -; AVX256-NEXT: vzeroupper -; AVX256-NEXT: retq +; AVX2-LABEL: sum_of_square_differences: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: .p2align 4 +; AVX2-NEXT: .LBB34_1: # %vector.body +; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmaddwd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: addq $8, %rcx +; AVX2-NEXT: cmpq %rcx, %rax +; AVX2-NEXT: jne .LBB34_1 +; AVX2-NEXT: # %bb.2: # %middle.block +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: sum_of_square_differences: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: .p2align 4 +; AVX512-NEXT: .LBB34_1: # %vector.body +; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpsubd %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmaddwd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: addq $8, %rcx +; 
AVX512-NEXT: cmpq %rcx, %rax +; AVX512-NEXT: jne .LBB34_1 +; AVX512-NEXT: # %bb.2: # %middle.block +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %0 = zext i32 %n to i64 br label %vector.body diff --git a/llvm/test/CodeGen/X86/pr92569.ll b/llvm/test/CodeGen/X86/pr92569.ll index 5f306e998398f..f13e0313206d2 100644 --- a/llvm/test/CodeGen/X86/pr92569.ll +++ b/llvm/test/CodeGen/X86/pr92569.ll @@ -9,6 +9,7 @@ define void @PR92569(i64 %arg, <8 x i8> %arg1) { ; CHECK-NEXT: shrb $3, %al ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: andl $15, %eax ; CHECK-NEXT: movzbl -24(%rsp,%rax), %eax ; CHECK-NEXT: movl %eax, 0 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll index da8a3f3fa0d4e..ab0c2a646f868 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -91,42 +91,47 @@ define <2 x i32> @trunc_packus_v2i64_v2i32(<2 x i64> %a0) { ; AVX512F-LABEL: trunc_packus_v2i64_v2i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0] +; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovusqd %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_packus_v2i64_v2i32: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovusqd %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_packus_v2i64_v2i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0] +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovusqd %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i32: ; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovusqd %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_packus_v2i64_v2i32: ; SKX: # %bb.0: +; SKX-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpmovusqd %xmm0, %xmm0 +; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: retq %1 = icmp slt <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -211,9 +216,11 @@ define void 
@trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; AVX512F-LABEL: trunc_packus_v2i64_v2i32_store: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0] +; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512F-NEXT: vmovq %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -228,9 +235,11 @@ define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; AVX512BW-LABEL: trunc_packus_v2i64_v2i32_store: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0] +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512BW-NEXT: vmovq %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -421,44 +430,49 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; AVX512F-LABEL: trunc_packus_v4i64_v4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_packus_v4i64_v4i32: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovusqd %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_packus_v4i64_v4i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i32: ; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovusqd %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_packus_v4i64_v4i32: ; SKX: # %bb.0: +; SKX-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 -; SKX-NEXT: vpmovusqd %ymm0, %xmm0 +; SKX-NEXT: vpmovqd %ymm0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp slt <4 x i64> %a0, @@ -718,18 +732,23 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25 ; ; AVX512-LABEL: trunc_packus_v8i64_v8i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0 -; 
AVX512-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_packus_v8i64_v8i32: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; SKX-NEXT: vpmaxsq (%rdi), %ymm0, %ymm1 -; SKX-NEXT: vpmovusqd %ymm1, %xmm1 -; SKX-NEXT: vpmaxsq 32(%rdi), %ymm0, %ymm0 -; SKX-NEXT: vpmovusqd %ymm0, %xmm0 +; SKX-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4294967295,4294967295,4294967295,4294967295] +; SKX-NEXT: vpminsq (%rdi), %ymm0, %ymm1 +; SKX-NEXT: vpminsq 32(%rdi), %ymm0, %ymm0 +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; SKX-NEXT: vpmaxsq %ymm2, %ymm0, %ymm0 +; SKX-NEXT: vpmaxsq %ymm2, %ymm1, %ymm1 +; SKX-NEXT: vpmovqd %ymm1, %xmm1 +; SKX-NEXT: vpmovqd %ymm0, %xmm0 ; SKX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; SKX-NEXT: retq %a0 = load <8 x i64>, ptr %p0 @@ -844,40 +863,49 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) { ; AVX512F-LABEL: trunc_packus_v2i64_v2i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] +; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_packus_v2i64_v2i16: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovusqw %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovqw %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_packus_v2i64_v2i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i16: ; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovusqw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmovqw %xmm0, %xmm0 ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_packus_v2i64_v2i16: ; SKX: # %bb.0: +; SKX-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpmovusqw %xmm0, %xmm0 +; SKX-NEXT: vpmovqw %xmm0, %xmm0 ; SKX-NEXT: retq %1 = icmp slt <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -991,9 +1019,12 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; AVX512F-LABEL: trunc_packus_v2i64_v2i16_store: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] +; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; 
AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1008,9 +1039,12 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; AVX512BW-LABEL: trunc_packus_v2i64_v2i16_store: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1180,9 +1214,10 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) { ; AVX512F-LABEL: trunc_packus_v4i64_v4i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1197,9 +1232,10 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) { ; AVX512BW-LABEL: trunc_packus_v4i64_v4i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -1372,9 +1408,10 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; AVX512F-LABEL: trunc_packus_v4i64_v4i16_store: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1390,9 +1427,10 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; AVX512BW-LABEL: trunc_packus_v4i64_v4i16_store: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1678,9 +1716,11 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25 ; ; AVX512-LABEL: trunc_packus_v8i64_v8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0 -; AVX512-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -1738,21 +1778,68 @@ define 
<4 x i16> @trunc_packus_v4i32_v4i16(<4 x i32> %a0) { ; ; SSE41-LABEL: trunc_packus_v4i32_v4i16: ; SSE41: # %bb.0: +; SSE41-NEXT: pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pmaxsd %xmm1, %xmm0 ; SSE41-NEXT: packusdw %xmm0, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_packus_v4i32_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: trunc_packus_v4i32_v4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq ; -; AVX512-LABEL: trunc_packus_v4i32_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX2-LABEL: trunc_packus_v4i32_v4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535] +; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_packus_v4i32_v4i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535] +; AVX512F-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_packus_v4i32_v4i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_packus_v4i32_v4i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535] +; AVX512BW-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_packus_v4i32_v4i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_packus_v4i32_v4i16: ; SKX: # %bb.0: +; SKX-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; SKX-NEXT: retq %1 = icmp slt <4 x i32> %a0, @@ -1800,18 +1887,38 @@ define void @trunc_packus_v4i32_v4i16_store(<4 x i32> %a0, ptr%p1) { ; ; SSE41-LABEL: trunc_packus_v4i32_v4i16_store: ; SSE41: # %bb.0: -; SSE41-NEXT: packusdw %xmm0, %xmm0 -; SSE41-NEXT: movq %xmm0, (%rdi) +; SSE41-NEXT: pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pmaxsd %xmm0, %xmm1 +; SSE41-NEXT: packusdw %xmm1, %xmm1 +; SSE41-NEXT: movq %xmm1, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_packus_v4i32_v4i16_store: -; AVX: # %bb.0: -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_packus_v4i32_v4i16_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: 
vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v4i32_v4i16_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535] +; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v4i32_v4i16_store: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535] +; AVX512F-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rdi) ; AVX512F-NEXT: retq @@ -1825,6 +1932,10 @@ define void @trunc_packus_v4i32_v4i16_store(<4 x i32> %a0, ptr%p1) { ; ; AVX512BW-LABEL: trunc_packus_v4i32_v4i16_store: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535] +; AVX512BW-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rdi) ; AVX512BW-NEXT: retq @@ -1907,18 +2018,34 @@ define <8 x i16> @trunc_packus_v8i32_v8i16(<8 x i32> %a0) { ; ; SSE41-LABEL: trunc_packus_v8i32_v8i16: ; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] +; SSE41-NEXT: pminsd %xmm2, %xmm0 +; SSE41-NEXT: pminsd %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmaxsd %xmm2, %xmm1 +; SSE41-NEXT: pmaxsd %xmm2, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_packus_v8i32_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [65535,65535,65535,65535] +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_packus_v8i32_v8i16: ; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -1926,39 +2053,50 @@ define <8 x i16> @trunc_packus_v8i32_v8i16(<8 x i32> %a0) { ; ; AVX512F-LABEL: trunc_packus_v8i32_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_packus_v8i32_v8i16: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: 
vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_packus_v8i32_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i16: ; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_packus_v8i32_v8i16: ; SKX: # %bb.0: +; SKX-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; SKX-NEXT: vpmovusdw %ymm0, %xmm0 +; SKX-NEXT: vpmovdw %ymm0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp slt <8 x i32> %a0, @@ -2073,39 +2211,70 @@ define <16 x i16> @trunc_packus_v16i32_v16i16(ptr %p0) "min-legal-vector-width"= ; ; SSE41-LABEL: trunc_packus_v16i32_v16i16: ; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] ; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: pminsd %xmm2, %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm3 +; SSE41-NEXT: pminsd %xmm2, %xmm3 ; SSE41-NEXT: movdqa 32(%rdi), %xmm1 -; SSE41-NEXT: packusdw 16(%rdi), %xmm0 -; SSE41-NEXT: packusdw 48(%rdi), %xmm1 +; SSE41-NEXT: pminsd %xmm2, %xmm1 +; SSE41-NEXT: pminsd 48(%rdi), %xmm2 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pmaxsd %xmm4, %xmm2 +; SSE41-NEXT: pmaxsd %xmm4, %xmm1 +; SSE41-NEXT: packusdw %xmm2, %xmm1 +; SSE41-NEXT: pmaxsd %xmm4, %xmm3 +; SSE41-NEXT: pmaxsd %xmm4, %xmm0 +; SSE41-NEXT: packusdw %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_packus_v16i32_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX1-NEXT: vpackusdw 48(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [65535,65535,65535,65535] +; AVX1-NEXT: vpminsd 32(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpminsd 48(%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vpminsd (%rdi), %xmm0, %xmm3 +; AVX1-NEXT: vpminsd 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpmaxsd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmaxsd %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_packus_v16i32_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX2-NEXT: vpminsd (%rdi), %ymm0, %ymm1 +; AVX2-NEXT: vpminsd 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxsd %ymm2, %ymm1, 
%ymm1 +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_packus_v16i32_v16i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vpmaxsd (%rdi), %zmm0, %zmm0 -; AVX512-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_packus_v16i32_v16i16: ; SKX: # %bb.0: -; SKX-NEXT: vmovdqa (%rdi), %ymm0 -; SKX-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0 +; SKX-NEXT: vpbroadcastd {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535] +; SKX-NEXT: vpminsd (%rdi), %ymm0, %ymm1 +; SKX-NEXT: vpminsd 32(%rdi), %ymm0, %ymm0 +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; SKX-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0 +; SKX-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1 +; SKX-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 ; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; SKX-NEXT: retq %a0 = load <16 x i32>, ptr %p0 @@ -2228,40 +2397,51 @@ define <2 x i8> @trunc_packus_v2i64_v2i8(<2 x i64> %a0) { ; AVX512F-LABEL: trunc_packus_v2i64_v2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,255] +; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_packus_v2i64_v2i8: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovusqb %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovqb %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_packus_v2i64_v2i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,255] +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i8: ; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovusqb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmovqb %xmm0, %xmm0 ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_packus_v2i64_v2i8: ; SKX: # %bb.0: +; SKX-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpmovusqb %xmm0, %xmm0 +; SKX-NEXT: vpmovqb %xmm0, %xmm0 ; SKX-NEXT: retq %1 = icmp slt <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -2383,9 +2563,13 @@ define void @trunc_packus_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) { ; AVX512F-LABEL: trunc_packus_v2i64_v2i8_store: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 
killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,255] +; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2400,9 +2584,13 @@ define void @trunc_packus_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) { ; AVX512BW-LABEL: trunc_packus_v2i64_v2i8_store: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,255] +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2576,9 +2764,10 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) { ; AVX512F-LABEL: trunc_packus_v4i64_v4i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2593,9 +2782,10 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) { ; AVX512BW-LABEL: trunc_packus_v4i64_v4i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -2772,9 +2962,10 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; AVX512F-LABEL: trunc_packus_v4i64_v4i8_store: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2790,9 +2981,10 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; AVX512BW-LABEL: trunc_packus_v4i64_v4i8_store: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3920,28 +4112,101 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2 } define <4 x i8> @trunc_packus_v4i32_v4i8(<4 x i32> %a0) "min-legal-vector-width"="256" { -; SSE-LABEL: trunc_packus_v4i32_v4i8: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm0, %xmm0 -; SSE-NEXT: packuswb 
%xmm0, %xmm0 -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: trunc_packus_v4i32_v4i8: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: retq ; -; AVX-LABEL: trunc_packus_v4i32_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE41-LABEL: trunc_packus_v4i32_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pmaxsd %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm0, %xmm0 +; SSE41-NEXT: packuswb %xmm0, %xmm0 +; SSE41-NEXT: retq ; -; AVX512-LABEL: trunc_packus_v4i32_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX1-LABEL: trunc_packus_v4i32_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_packus_v4i32_v4i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] +; AVX2-SLOW-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_packus_v4i32_v4i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] +; AVX2-FAST-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_packus_v4i32_v4i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] +; AVX512F-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_packus_v4i32_v4i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_packus_v4i32_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] +; AVX512BW-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_packus_v4i32_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, 
%xmm0 +; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmovdb %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_packus_v4i32_v4i8: ; SKX: # %bb.0: -; SKX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; SKX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; SKX-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; SKX-NEXT: vpmovdb %xmm0, %xmm0 ; SKX-NEXT: retq %1 = icmp slt <4 x i32> %a0, %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> @@ -3952,23 +4217,71 @@ define <4 x i8> @trunc_packus_v4i32_v4i8(<4 x i32> %a0) "min-legal-vector-width" } define void @trunc_packus_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) { -; SSE-LABEL: trunc_packus_v4i32_v4i8_store: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm0, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movd %xmm0, (%rdi) -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: trunc_packus_v4i32_v4i8_store: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: por %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm1 +; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm1 +; SSE2-SSSE3-NEXT: movd %xmm1, (%rdi) +; SSE2-SSSE3-NEXT: retq ; -; AVX-LABEL: trunc_packus_v4i32_v4i8_store: -; AVX: # %bb.0: -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, (%rdi) -; AVX-NEXT: retq +; SSE41-LABEL: trunc_packus_v4i32_v4i8_store: +; SSE41: # %bb.0: +; SSE41-NEXT: pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pmaxsd %xmm0, %xmm1 +; SSE41-NEXT: packusdw %xmm1, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm1 +; SSE41-NEXT: movd %xmm1, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v4i32_v4i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_packus_v4i32_v4i8_store: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] +; AVX2-SLOW-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi) +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_packus_v4i32_v4i8_store: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] +; AVX2-FAST-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovd %xmm0, (%rdi) +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v4i32_v4i8_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] +; AVX512F-NEXT: vpminsd %xmm1, %xmm0, 
%xmm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, (%rdi) ; AVX512F-NEXT: retq @@ -3982,7 +4295,11 @@ define void @trunc_packus_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) { ; ; AVX512BW-LABEL: trunc_packus_v4i32_v4i8_store: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] +; AVX512BW-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, (%rdi) ; AVX512BW-NEXT: retq @@ -4010,33 +4327,75 @@ define void @trunc_packus_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) { } define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) { -; SSE-LABEL: trunc_packus_v8i32_v8i8: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: trunc_packus_v8i32_v8i8: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_packus_v8i32_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = [255,255,255,255] +; SSE41-NEXT: pminsd %xmm2, %xmm0 +; SSE41-NEXT: pminsd %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmaxsd %xmm2, %xmm1 +; SSE41-NEXT: pmaxsd %xmm2, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packuswb %xmm0, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_packus_v8i32_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255] +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_packus_v8i32_v8i8: ; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v8i32_v8i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: 
vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4050,9 +4409,11 @@ define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) { ; ; AVX512BW-LABEL: trunc_packus_v8i32_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -4080,17 +4441,54 @@ define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) { } define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) { -; SSE-LABEL: trunc_packus_v8i32_v8i8_store: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movq %xmm0, (%rdi) -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: trunc_packus_v8i32_v8i8_store: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: por %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: movq %xmm0, (%rdi) +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_packus_v8i32_v8i8_store: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = [255,255,255,255] +; SSE41-NEXT: pminsd %xmm2, %xmm0 +; SSE41-NEXT: pminsd %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmaxsd %xmm2, %xmm1 +; SSE41-NEXT: pmaxsd %xmm2, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packuswb %xmm0, %xmm0 +; SSE41-NEXT: movq %xmm0, (%rdi) +; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_packus_v8i32_v8i8_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255] +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rdi) ; AVX1-NEXT: vzeroupper @@ -4098,8 +4496,12 @@ define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) { ; ; AVX2-LABEL: trunc_packus_v8i32_v8i8_store: ; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = 
[255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, (%rdi) ; AVX2-NEXT: vzeroupper @@ -4107,9 +4509,11 @@ define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) { ; ; AVX512F-LABEL: trunc_packus_v8i32_v8i8_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -4124,9 +4528,11 @@ define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) { ; ; AVX512BW-LABEL: trunc_packus_v8i32_v8i8_store: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -4156,28 +4562,97 @@ define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) { } define <16 x i8> @trunc_packus_v16i32_v16i8(ptr %p0) "min-legal-vector-width"="256" { -; SSE-LABEL: trunc_packus_v16i32_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 32(%rdi), %xmm1 -; SSE-NEXT: packssdw 48(%rdi), %xmm1 -; SSE-NEXT: packssdw 16(%rdi), %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: trunc_packus_v16i32_v16i8: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm3 +; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm4 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: por %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: por %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: packuswb %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_packus_v16i32_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = [255,255,255,255] +; SSE41-NEXT: movdqa 32(%rdi), %xmm2 +; SSE41-NEXT: pminsd %xmm1, %xmm2 +; SSE41-NEXT: movdqa 48(%rdi), %xmm3 +; SSE41-NEXT: pminsd %xmm1, %xmm3 +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: pminsd %xmm1, %xmm0 +; SSE41-NEXT: pminsd 16(%rdi), %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pmaxsd %xmm4, %xmm1 +; SSE41-NEXT: pmaxsd %xmm4, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: pmaxsd %xmm4, %xmm3 +; SSE41-NEXT: pmaxsd %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_packus_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX1-NEXT: vpackssdw 48(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255] +; AVX1-NEXT: vpminsd 32(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpminsd 48(%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vpminsd (%rdi), %xmm0, %xmm3 +; AVX1-NEXT: vpminsd 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpmaxsd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmaxsd %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_packus_v16i32_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminsd (%rdi), %ymm0, %ymm1 +; AVX2-NEXT: vpminsd 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] @@ -4186,18 +4661,22 @@ define <16 x i8> @trunc_packus_v16i32_v16i8(ptr %p0) "min-legal-vector-width"="2 ; ; AVX512-LABEL: trunc_packus_v16i32_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vpmaxsd (%rdi), %zmm0, %zmm0 -; AVX512-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_packus_v16i32_v16i8: ; SKX: # %bb.0: -; SKX-NEXT: vmovdqa (%rdi), %ymm0 -; SKX-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0 -; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; SKX-NEXT: vpmovuswb %ymm0, %xmm0 +; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; SKX-NEXT: vpmaxsd 32(%rdi), %ymm0, %ymm1 +; SKX-NEXT: vpmovusdb %ymm1, %xmm1 +; SKX-NEXT: vpmaxsd (%rdi), %ymm0, %ymm0 +; SKX-NEXT: vpmovusdb %ymm0, %xmm0 +; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %a0 = load <16 x i32>, ptr 
@@ -4210,30 +4689,100 @@ define <16 x i8> @trunc_packus_v16i32_v16i8(ptr %p0) "min-legal-vector-width"="2
}

define void @trunc_packus_v16i32_v16i8_store(ptr %p0, ptr %p1) "min-legal-vector-width"="256" {
-; SSE-LABEL: trunc_packus_v16i32_v16i8_store:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm0
-; SSE-NEXT: movdqa 32(%rdi), %xmm1
-; SSE-NEXT: packssdw 48(%rdi), %xmm1
-; SSE-NEXT: packssdw 16(%rdi), %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, (%rsi)
-; SSE-NEXT: retq
+; SSE2-SSSE3-LABEL: trunc_packus_v16i32_v16i8_store:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm3
+; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm2
+; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm1
+; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm4
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255]
+; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm0
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm0
+; SSE2-SSSE3-NEXT: por %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm4
+; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm1
+; SSE2-SSSE3-NEXT: por %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm4
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm4
+; SSE2-SSSE3-NEXT: por %xmm3, %xmm4
+; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm3
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm3
+; SSE2-SSSE3-NEXT: por %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5
+; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: packuswb %xmm5, %xmm3
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm4
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: packuswb %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: movdqa %xmm3, (%rsi)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc_packus_v16i32_v16i8_store:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = [255,255,255,255]
+; SSE41-NEXT: movdqa 32(%rdi), %xmm1
+; SSE41-NEXT: pminsd %xmm0, %xmm1
+; SSE41-NEXT: movdqa 48(%rdi), %xmm2
+; SSE41-NEXT: pminsd %xmm0, %xmm2
+; SSE41-NEXT: movdqa (%rdi), %xmm3
+; SSE41-NEXT: pminsd %xmm0, %xmm3
+; SSE41-NEXT: pminsd 16(%rdi), %xmm0
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pmaxsd %xmm4, %xmm0
+; SSE41-NEXT: pmaxsd %xmm4, %xmm3
+; SSE41-NEXT: packusdw %xmm0, %xmm3
+; SSE41-NEXT: pmaxsd %xmm4, %xmm2
+; SSE41-NEXT: pmaxsd %xmm4, %xmm1
+; SSE41-NEXT: packusdw %xmm2, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm3, (%rsi)
+; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v16i32_v16i8_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX1-NEXT: vpackssdw 48(%rdi), %xmm1, %xmm1
-; AVX1-NEXT: vpackssdw 16(%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255]
+; AVX1-NEXT: vpminsd 32(%rdi), %xmm0, %xmm1
+; AVX1-NEXT: vpminsd 48(%rdi), %xmm0, %xmm2
+; AVX1-NEXT: vpminsd (%rdi), %xmm0, %xmm3
+; AVX1-NEXT: vpminsd 16(%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpmaxsd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxsd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpmaxsd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpmaxsd %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v16i32_v16i8_store:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpminsd (%rdi), %ymm0, %ymm1
+; AVX2-NEXT: vpminsd 32(%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
@@ -4251,10 +4800,13 @@ define void @trunc_packus_v16i32_v16i8_store(ptr %p0, ptr %p1) "min-legal-vector
;
; SKX-LABEL: trunc_packus_v16i32_v16i8_store:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa (%rdi), %ymm0
-; SKX-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0
-; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; SKX-NEXT: vpmovuswb %ymm0, (%rsi)
+; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; SKX-NEXT: vpmaxsd 32(%rdi), %ymm0, %ymm1
+; SKX-NEXT: vpmovusdb %ymm1, %xmm1
+; SKX-NEXT: vpmaxsd (%rdi), %ymm0, %ymm0
+; SKX-NEXT: vpmovusdb %ymm0, %xmm0
+; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SKX-NEXT: vmovdqa %xmm0, (%rsi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%a = load <16 x i32>, ptr %p0
@@ -4270,21 +4822,33 @@ define void @trunc_packus_v16i32_v16i8_store(ptr %p0, ptr %p1) "min-legal-vector
define <8 x i8> @trunc_packus_v8i16_v8i8(<8 x i16> %a0) {
; SSE-LABEL: trunc_packus_v8i16_v8i8:
; SSE: # %bb.0:
+; SSE-NEXT: pminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pmaxsw %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: trunc_packus_v8i16_v8i8:
; AVX: # %bb.0:
+; AVX-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc_packus_v8i16_v8i8:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; SKX-LABEL: trunc_packus_v8i16_v8i8:
; SKX: # %bb.0:
+; SKX-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; SKX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; SKX-NEXT: retq
%1 = icmp slt <8 x i16> %a0,
@@ -4298,30 +4862,45 @@ define <8 x i8> @trunc_packus_v8i16_v8i8(<8 x i16> %a0) {
define void @trunc_packus_v8i16_v8i8_store(<8 x i16> %a0, ptr%p1) {
; SSE-LABEL: trunc_packus_v8i16_v8i8_store:
; SSE: # %bb.0:
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movq %xmm0, (%rdi)
+; SSE-NEXT: pminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pmaxsw %xmm0, %xmm1
+; SSE-NEXT: packuswb %xmm1, %xmm1
+; SSE-NEXT: movq %xmm1, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: trunc_packus_v8i16_v8i8_store:
; AVX: # %bb.0:
+; AVX-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v8i16_v8i8_store:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v8i16_v8i8_store:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, (%rdi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v8i16_v8i8_store:
; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
; AVX512BW-NEXT: retq
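The v8i16 cases are the same clamp at i16 width, and they highlight why the old output had no compare at all: packuswb itself saturates signed words into the unsigned byte range, so the clamp used to fold into the pack; in the regenerated checks the pminsw/pmaxsw survive. The tests write the clamp with icmp/select, which is equivalent to the min/max intrinsics; a sketch of the i16 form with hypothetical names:

declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>)

define <8 x i8> @packus_i16_sketch(<8 x i16> %a0) {
  %min = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a0, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>)
  %max = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %min, <8 x i16> zeroinitializer)
  %r = trunc <8 x i16> %max to <8 x i8>
  ret <8 x i8> %r
}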
@@ -4349,20 +4928,46 @@ define void @trunc_packus_v8i16_v8i8_store(<8 x i16> %a0, ptr%p1) {
}

define <16 x i8> @trunc_packus_v16i16_v16i8(<16 x i16> %a0) {
-; SSE-LABEL: trunc_packus_v16i16_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-SSSE3-LABEL: trunc_packus_v16i16_v16i8:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-SSSE3-NEXT: pminsw %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pminsw %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSE2-SSSE3-NEXT: pmaxsw %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pmaxsw %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc_packus_v16i16_v16i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pminsw %xmm2, %xmm0
+; SSE41-NEXT: pminsw %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pmaxsw %xmm2, %xmm1
+; SSE41-NEXT: pmaxsw %xmm2, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v16i16_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxsw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v16i16_v16i8:
; AVX2: # %bb.0:
+; AVX2-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
@@ -4370,38 +4975,49 @@ define <16 x i8> @trunc_packus_v16i16_v16i8(<16 x i16> %a0) {
;
; AVX512F-LABEL: trunc_packus_v16i16_v16i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v16i16_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v16i16_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v16i16_v16i8:
; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovuswb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_packus_v16i16_v16i8:
; SKX: # %bb.0:
+; SKX-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; SKX-NEXT: vpmovuswb %ymm0, %xmm0
+; SKX-NEXT: vpmovwb %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%1 = icmp slt <16 x i16> %a0,
@@ -4413,62 +5029,124 @@ define <16 x i8> @trunc_packus_v16i16_v16i8(<16 x i16> %a0) {
}

define <32 x i8> @trunc_packus_v32i16_v32i8(ptr %p0) "min-legal-vector-width"="256" {
-; SSE-LABEL: trunc_packus_v32i16_v32i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm0
-; SSE-NEXT: movdqa 32(%rdi), %xmm1
-; SSE-NEXT: packuswb 16(%rdi), %xmm0
-; SSE-NEXT: packuswb 48(%rdi), %xmm1
-; SSE-NEXT: retq
+; SSE2-SSSE3-LABEL: trunc_packus_v32i16_v32i8:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm0
+; SSE2-SSSE3-NEXT: pminsw %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm3
+; SSE2-SSSE3-NEXT: pminsw %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm1
+; SSE2-SSSE3-NEXT: pminsw %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pminsw 48(%rdi), %xmm2
+; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm4
+; SSE2-SSSE3-NEXT: pmaxsw %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: pmaxsw %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pmaxsw %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pmaxsw %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc_packus_v32i16_v32i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: pminsw %xmm2, %xmm0
+; SSE41-NEXT: movdqa 16(%rdi), %xmm3
+; SSE41-NEXT: pminsw %xmm2, %xmm3
+; SSE41-NEXT: movdqa 32(%rdi), %xmm1
+; SSE41-NEXT: pminsw %xmm2, %xmm1
+; SSE41-NEXT: pminsw 48(%rdi), %xmm2
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pmaxsw %xmm4, %xmm2
+; SSE41-NEXT: pmaxsw %xmm4, %xmm1
+; SSE41-NEXT: packuswb %xmm2, %xmm1
+; SSE41-NEXT: pmaxsw %xmm4, %xmm3
+; SSE41-NEXT: pmaxsw %xmm4, %xmm0
+; SSE41-NEXT: packuswb %xmm3, %xmm0
+; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v32i16_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX1-NEXT: vpackuswb 48(%rdi), %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb 16(%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpminsw 32(%rdi), %xmm0, %xmm1
+; AVX1-NEXT: vpminsw 48(%rdi), %xmm0, %xmm2
+; AVX1-NEXT: vpminsw (%rdi), %xmm0, %xmm3
+; AVX1-NEXT: vpminsw 16(%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpmaxsw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxsw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpmaxsw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpmaxsw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v32i16_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vpackuswb 32(%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpminsw (%rdi), %ymm0, %ymm1
+; AVX2-NEXT: vpminsw 32(%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v32i16_v32i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpackuswb 32(%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpminsw (%rdi), %ymm0, %ymm1
+; AVX512F-NEXT: vpminsw 32(%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v32i16_v32i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vpackuswb 32(%rdi), %ymm0, %ymm0
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpminsw (%rdi), %ymm0, %ymm1
+; AVX512VL-NEXT: vpminsw 32(%rdi), %ymm0, %ymm0
+; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v32i16_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vpmaxsw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v32i16_v32i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmaxsw (%rdi), %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovuswb %zmm0, %ymm0
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_packus_v32i16_v32i8:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa (%rdi), %ymm0
-; SKX-NEXT: vpackuswb 32(%rdi), %ymm0, %ymm0
+; SKX-NEXT: vpbroadcastd {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SKX-NEXT: vpminsw (%rdi), %ymm0, %ymm1
+; SKX-NEXT: vpminsw 32(%rdi), %ymm0, %ymm0
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
+; SKX-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1
+; SKX-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; SKX-NEXT: retq
%a0 = load <32 x i16>, ptr %p0
@@ -4481,42 +5159,169 @@ define <32 x i8> @trunc_packus_v32i16_v32i8(ptr %p0) "min-legal-vector-width"="2
}

define <32 x i8> @trunc_packus_v32i32_v32i8(ptr %p0) "min-legal-vector-width"="256" {
-; SSE-LABEL: trunc_packus_v32i32_v32i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm0
-; SSE-NEXT: movdqa 32(%rdi), %xmm2
-; SSE-NEXT: movdqa 64(%rdi), %xmm1
-; SSE-NEXT: movdqa 96(%rdi), %xmm3
-; SSE-NEXT: packssdw 48(%rdi), %xmm2
-; SSE-NEXT: packssdw 16(%rdi), %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: packssdw 112(%rdi), %xmm3
-; SSE-NEXT: packssdw 80(%rdi), %xmm1
-; SSE-NEXT: packuswb %xmm3, %xmm1
-; SSE-NEXT: retq
+; SSE2-SSSE3-LABEL: trunc_packus_v32i32_v32i8:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movdqa 80(%rdi), %xmm0
+; SSE2-SSSE3-NEXT: movdqa 64(%rdi), %xmm1
+; SSE2-SSSE3-NEXT: movdqa 112(%rdi), %xmm7
+; SSE2-SSSE3-NEXT: movdqa 96(%rdi), %xmm9
+; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm5
+; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm8
+; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm3
+; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm4
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255]
+; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm2
+; SSE2-SSSE3-NEXT: por %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm3
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm4
+; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm3
+; SSE2-SSSE3-NEXT: por %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm4
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm5
+; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm4
+; SSE2-SSSE3-NEXT: por %xmm5, %xmm4
+; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm5
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm5
+; SSE2-SSSE3-NEXT: pand %xmm5, %xmm8
+; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm5
+; SSE2-SSSE3-NEXT: por %xmm8, %xmm5
+; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm8
+; SSE2-SSSE3-NEXT: pand %xmm8, %xmm9
+; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm8
+; SSE2-SSSE3-NEXT: por %xmm9, %xmm8
+; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm9
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm9
+; SSE2-SSSE3-NEXT: pand %xmm9, %xmm7
+; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm9
+; SSE2-SSSE3-NEXT: por %xmm7, %xmm9
+; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm7
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm7
+; SSE2-SSSE3-NEXT: pand %xmm7, %xmm1
+; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm7
+; SSE2-SSSE3-NEXT: por %xmm1, %xmm7
+; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm1
+; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm6
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm7, %xmm1
+; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm0
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE2-SSSE3-NEXT: pand %xmm9, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm7
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-SSSE3-NEXT: pand %xmm8, %xmm7
+; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm7
+; SSE2-SSSE3-NEXT: packuswb %xmm7, %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm7
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-SSSE3-NEXT: pand %xmm5, %xmm7
+; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm7, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm4
+; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: packuswb %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc_packus_v32i32_v32i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = [255,255,255,255]
+; SSE41-NEXT: movdqa 32(%rdi), %xmm2
+; SSE41-NEXT: pminsd %xmm5, %xmm2
+; SSE41-NEXT: movdqa 48(%rdi), %xmm3
+; SSE41-NEXT: pminsd %xmm5, %xmm3
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: pminsd %xmm5, %xmm0
+; SSE41-NEXT: movdqa 16(%rdi), %xmm4
+; SSE41-NEXT: pminsd %xmm5, %xmm4
+; SSE41-NEXT: movdqa 96(%rdi), %xmm6
+; SSE41-NEXT: pminsd %xmm5, %xmm6
+; SSE41-NEXT: movdqa 112(%rdi), %xmm7
+; SSE41-NEXT: pminsd %xmm5, %xmm7
+; SSE41-NEXT: movdqa 64(%rdi), %xmm1
+; SSE41-NEXT: pminsd %xmm5, %xmm1
+; SSE41-NEXT: pminsd 80(%rdi), %xmm5
+; SSE41-NEXT: pxor %xmm8, %xmm8
+; SSE41-NEXT: pmaxsd %xmm8, %xmm5
+; SSE41-NEXT: pmaxsd %xmm8, %xmm1
+; SSE41-NEXT: packusdw %xmm5, %xmm1
+; SSE41-NEXT: pmaxsd %xmm8, %xmm7
+; SSE41-NEXT: pmaxsd %xmm8, %xmm6
+; SSE41-NEXT: packusdw %xmm7, %xmm6
+; SSE41-NEXT: packuswb %xmm6, %xmm1
+; SSE41-NEXT: pmaxsd %xmm8, %xmm4
+; SSE41-NEXT: pmaxsd %xmm8, %xmm0
+; SSE41-NEXT: packusdw %xmm4, %xmm0
+; SSE41-NEXT: pmaxsd %xmm8, %xmm3
+; SSE41-NEXT: pmaxsd %xmm8, %xmm2
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: packuswb %xmm2, %xmm0
+; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v32i32_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX1-NEXT: vmovdqa 64(%rdi), %xmm2
-; AVX1-NEXT: vmovdqa 96(%rdi), %xmm3
-; AVX1-NEXT: vpackssdw 112(%rdi), %xmm3, %xmm3
-; AVX1-NEXT: vpackssdw 80(%rdi), %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpackssdw 48(%rdi), %xmm1, %xmm1
-; AVX1-NEXT: vpackssdw 16(%rdi), %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255]
+; AVX1-NEXT: vpminsd 96(%rdi), %xmm0, %xmm1
+; AVX1-NEXT: vpminsd 112(%rdi), %xmm0, %xmm2
+; AVX1-NEXT: vpminsd 64(%rdi), %xmm0, %xmm3
+; AVX1-NEXT: vpminsd 80(%rdi), %xmm0, %xmm4
+; AVX1-NEXT: vpminsd 32(%rdi), %xmm0, %xmm5
+; AVX1-NEXT: vpminsd 48(%rdi), %xmm0, %xmm6
+; AVX1-NEXT: vpminsd (%rdi), %xmm0, %xmm7
+; AVX1-NEXT: vpminsd 16(%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT: vpmaxsd %xmm8, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxsd %xmm8, %xmm7, %xmm7
+; AVX1-NEXT: vpackusdw %xmm0, %xmm7, %xmm0
+; AVX1-NEXT: vpmaxsd %xmm8, %xmm6, %xmm6
+; AVX1-NEXT: vpmaxsd %xmm8, %xmm5, %xmm5
+; AVX1-NEXT: vpackusdw %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpackuswb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxsd %xmm8, %xmm4, %xmm4
+; AVX1-NEXT: vpmaxsd %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpmaxsd %xmm8, %xmm2, %xmm2
+; AVX1-NEXT: vpmaxsd %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v32i32_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vmovdqa 64(%rdi), %ymm1
-; AVX2-NEXT: vpackssdw 96(%rdi), %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpminsd 64(%rdi), %ymm0, %ymm1
+; AVX2-NEXT: vpminsd 96(%rdi), %ymm0, %ymm2
+; AVX2-NEXT: vpminsd (%rdi), %ymm0, %ymm3
+; AVX2-NEXT: vpminsd 32(%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpmaxsd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpmaxsd %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpackusdw %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpmaxsd %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpmaxsd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
@@ -4524,24 +5329,31 @@ define <32 x i8> @trunc_packus_v32i32_v32i8(ptr %p0) "min-legal-vector-width"="2
;
; AVX512-LABEL: trunc_packus_v32i32_v32i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vpmaxsd (%rdi), %zmm0, %zmm1
-; AVX512-NEXT: vpmovusdb %zmm1, %xmm1
-; AVX512-NEXT: vpmaxsd 64(%rdi), %zmm0, %zmm0
-; AVX512-NEXT: vpmovusdb %zmm0, %xmm0
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512-NEXT: vpminsd (%rdi), %zmm0, %zmm1
+; AVX512-NEXT: vpminsd 64(%rdi), %zmm0, %zmm0
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmaxsd %zmm2, %zmm1, %zmm1
+; AVX512-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT: retq
;
; SKX-LABEL: trunc_packus_v32i32_v32i8:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa (%rdi), %ymm0
-; SKX-NEXT: vmovdqa 64(%rdi), %ymm1
-; SKX-NEXT: vpackssdw 96(%rdi), %ymm1, %ymm1
-; SKX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; SKX-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0
-; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; SKX-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
-; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; SKX-NEXT: vpmaxsd 32(%rdi), %ymm0, %ymm1
+; SKX-NEXT: vpmovusdb %ymm1, %xmm1
+; SKX-NEXT: vpmaxsd 96(%rdi), %ymm0, %ymm2
+; SKX-NEXT: vpmovusdb %ymm2, %xmm2
+; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; SKX-NEXT: vpmaxsd (%rdi), %ymm0, %ymm2
+; SKX-NEXT: vpmovusdb %ymm2, %xmm2
+; SKX-NEXT: vpmaxsd 64(%rdi), %ymm0, %ymm0
+; SKX-NEXT: vpmovusdb %ymm0, %xmm0
+; SKX-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; SKX-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; SKX-NEXT: retq
%a0 = load <32 x i32>, ptr %p0
%1 = icmp slt <32 x i32> %a0,
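The vector-trunc-ssat.ll changes below are the signed-saturation counterpart: clamp to [-2^(w-1), 2^(w-1)-1], then truncate. For the i64-to-i32 tests the bounds are 2147483647 and -2147483648; the latter prints as the unsigned value 18446744071562067968 in the broadcast constants. A sketch of the pattern, names hypothetical:

define <2 x i32> @ssat_sketch(<2 x i64> %a0) {
  %c1 = icmp slt <2 x i64> %a0, <i64 2147483647, i64 2147483647>
  %min = select <2 x i1> %c1, <2 x i64> %a0, <2 x i64> <i64 2147483647, i64 2147483647>
  %c2 = icmp sgt <2 x i64> %min, <i64 -2147483648, i64 -2147483648>
  %max = select <2 x i1> %c2, <2 x i64> %min, <2 x i64> <i64 -2147483648, i64 -2147483648>
  %r = trunc <2 x i64> %max to <2 x i32>
  ret <2 x i32> %r
}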
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
index d0cdbf1e3f08d..fcbfc4e274f24 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -106,32 +106,44 @@ define <2 x i32> @trunc_ssat_v2i64_v2i32(<2 x i64> %a0) {
; AVX512F-LABEL: trunc_ssat_v2i64_v2i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vpmovsqd %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647]
+; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_ssat_v2i64_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovsqd %xmm0, %xmm0
+; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_ssat_v2i64_v2i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpmovsqd %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647]
+; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_ssat_v2i64_v2i32:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovsqd %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_ssat_v2i64_v2i32:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovsqd %xmm0, %xmm0
+; SKX-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; SKX-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: retq
%1 = icmp slt <2 x i64> %a0,
%2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64>
@@ -232,7 +244,11 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
; AVX512F-LABEL: trunc_ssat_v2i64_v2i32_store:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vpmovsqd %zmm0, %ymm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647]
+; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT: vmovq %xmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -245,7 +261,11 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
; AVX512BW-LABEL: trunc_ssat_v2i64_v2i32_store:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpmovsqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647]
+; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -441,34 +461,44 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
; AVX512F-LABEL: trunc_ssat_v4i64_v4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovsqd %zmm0, %ymm0
+; AVX512F-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512F-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_ssat_v4i64_v4i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovsqd %ymm0, %xmm0
+; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_ssat_v4i64_v4i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovsqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_ssat_v4i64_v4i32:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovsqd %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_ssat_v4i64_v4i32:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovsqd %ymm0, %xmm0
+; SKX-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; SKX-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; SKX-NEXT: vpmovqd %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%1 = icmp slt <4 x i64> %a0,
@@ -741,16 +771,22 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256"
; AVX512-LABEL: trunc_ssat_v8i64_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT: vpmovsqd %zmm0, %ymm0
+; AVX512-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: retq
;
; SKX-LABEL: trunc_ssat_v8i64_v8i32:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa (%rdi), %ymm0
-; SKX-NEXT: vmovdqa 32(%rdi), %ymm1
-; SKX-NEXT: vpmovsqd %ymm0, %xmm0
-; SKX-NEXT: vpmovsqd %ymm1, %xmm1
-; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; SKX-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2147483647,2147483647,2147483647,2147483647]
+; SKX-NEXT: vpminsq (%rdi), %ymm0, %ymm1
+; SKX-NEXT: vpminsq 32(%rdi), %ymm0, %ymm0
+; SKX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; SKX-NEXT: vpmaxsq %ymm2, %ymm0, %ymm0
+; SKX-NEXT: vpmaxsq %ymm2, %ymm1, %ymm1
+; SKX-NEXT: vpmovqd %ymm1, %xmm1
+; SKX-NEXT: vpmovqd %ymm0, %xmm0
+; SKX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; SKX-NEXT: retq
%a0 = load <8 x i64>, ptr %p0
%1 = icmp slt <8 x i64> %a0,
@@ -865,30 +901,46 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) {
; AVX512F-LABEL: trunc_ssat_v2i64_v2i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vpmovsqw %zmm0, %xmm0
+; AVX512F-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767]
+; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_ssat_v2i64_v2i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovsqw %xmm0, %xmm0
+; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovqw %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_ssat_v2i64_v2i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpmovsqw %zmm0, %xmm0
+; AVX512BW-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767]
+; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_ssat_v2i64_v2i16:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovsqw %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpmovqw %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_ssat_v2i64_v2i16:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovsqw %xmm0, %xmm0
+; SKX-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; SKX-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; SKX-NEXT: vpmovqw %xmm0, %xmm0
; SKX-NEXT: retq
%1 = icmp slt <2 x i64> %a0,
%2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64>
@@ -1003,7 +1055,12 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
; AVX512F-LABEL: trunc_ssat_v2i64_v2i16_store:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vpmovsqw %zmm0, %xmm0
+; AVX512F-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767]
+; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -1016,7 +1073,12 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
; AVX512BW-LABEL: trunc_ssat_v2i64_v2i16_store:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpmovsqw %zmm0, %xmm0
+; AVX512BW-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767]
+; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -1186,7 +1248,9 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) {
; AVX512F-LABEL: trunc_ssat_v4i64_v4i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovsqw %zmm0, %xmm0
+; AVX512F-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512F-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -1199,7 +1263,9 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) {
; AVX512BW-LABEL: trunc_ssat_v4i64_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovsqw %zmm0, %xmm0
+; AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -1372,7 +1438,9 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
; AVX512F-LABEL: trunc_ssat_v4i64_v4i16_store:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovsqw %zmm0, %xmm0
+; AVX512F-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512F-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -1386,7 +1454,9 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
; AVX512BW-LABEL: trunc_ssat_v4i64_v4i16_store:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovsqw %zmm0, %xmm0
+; AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -1675,7 +1745,9 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256"
; AVX512-LABEL: trunc_ssat_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT: vpmovsqw %zmm0, %xmm0
+; AVX512-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
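A recurring theme in the AVX512 hunks above: the one-instruction saturating truncates (vpmovsqd, vpmovsqw, vpmovsqb) are replaced by a vpminsq/vpmaxsq clamp plus a plain vpmovq* truncate, so one instruction becomes three plus constant-pool loads. If the new truncate flags are meant to be codegen-neutral, these look like regressions the X86 saturating-truncate matching would need to learn to see through. For reference, the signed bounds printed as unsigned constants decode as:

18446744071562067968 = 2^64 - 2147483648 (-2147483648 as i64)
18446744073709518848 = 2^64 - 32768 (-32768 as i64)
18446744073709551488 = 2^64 - 128 (-128 as i64)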
@@ -1698,23 +1770,82 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256"
}

define <4 x i16> @trunc_ssat_v4i32_v4i16(<4 x i32> %a0) {
-; SSE-LABEL: trunc_ssat_v4i32_v4i16:
-; SSE: # %bb.0:
-; SSE-NEXT: packssdw %xmm0, %xmm0
-; SSE-NEXT: retq
+; SSE2-SSSE3-LABEL: trunc_ssat_v4i32_v4i16:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packssdw %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: retq
;
-; AVX-LABEL: trunc_ssat_v4i32_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE41-LABEL: trunc_ssat_v4i32_v4i16:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: packssdw %xmm0, %xmm0
+; SSE41-NEXT: retq
;
-; AVX512-LABEL: trunc_ssat_v4i32_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX1-LABEL: trunc_ssat_v4i32_v4i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_ssat_v4i32_v4i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_ssat_v4i32_v4i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; AVX512F-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc_ssat_v4i32_v4i16:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512VL-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_ssat_v4i32_v4i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; AVX512BW-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc_ssat_v4i32_v4i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_ssat_v4i32_v4i16:
; SKX: # %bb.0:
+; SKX-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; SKX-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; SKX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; SKX-NEXT: retq
%1 = icmp slt <4 x i32> %a0,
@@ -1726,20 +1857,56 @@ define <4 x i16> @trunc_ssat_v4i32_v4i16(<4 x i32> %a0) {
}

define void @trunc_ssat_v4i32_v4i16_store(<4 x i32> %a0, ptr%p1) {
-; SSE-LABEL: trunc_ssat_v4i32_v4i16_store:
-; SSE: # %bb.0:
-; SSE-NEXT: packssdw %xmm0, %xmm0
-; SSE-NEXT: movq %xmm0, (%rdi)
-; SSE-NEXT: retq
+; SSE2-SSSE3-LABEL: trunc_ssat_v4i32_v4i16_store:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: por %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [4294934528,4294934528,4294934528,4294934528]
+; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pandn %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: por %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm1
+; SSE2-SSSE3-NEXT: movq %xmm1, (%rdi)
+; SSE2-SSSE3-NEXT: retq
;
-; AVX-LABEL: trunc_ssat_v4i32_v4i16_store:
-; AVX: # %bb.0:
-; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovq %xmm0, (%rdi)
-; AVX-NEXT: retq
+; SSE41-LABEL: trunc_ssat_v4i32_v4i16_store:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: packssdw %xmm0, %xmm0
+; SSE41-NEXT: movq %xmm0, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc_ssat_v4i32_v4i16_store:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovq %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_ssat_v4i32_v4i16_store:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rdi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_ssat_v4i32_v4i16_store:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; AVX512F-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rdi)
; AVX512F-NEXT: retq
@@ -1751,6 +1918,10 @@ define void @trunc_ssat_v4i32_v4i16_store(<4 x i32> %a0, ptr%p1) {
;
; AVX512BW-LABEL: trunc_ssat_v4i32_v4i16_store:
; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; AVX512BW-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
; AVX512BW-NEXT: retq
@@ -1774,20 +1945,63 @@ define void @trunc_ssat_v4i32_v4i16_store(<4 x i32> %a0, ptr%p1) {
}

define <8 x i16> @trunc_ssat_v8i32_v8i16(<8 x i32> %a0) {
-; SSE-LABEL: trunc_ssat_v8i32_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-SSSE3-LABEL: trunc_ssat_v8i32_v8i16:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767]
+; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
+; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: por %xmm3, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: por %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: por %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
+; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: por %xmm3, %xmm0
+; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc_ssat_v8i32_v8i16:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxwd {{.*#+}} xmm2 = [32767,32767,32767,32767]
+; SSE41-NEXT: pminsd %xmm2, %xmm0
+; SSE41-NEXT: pminsd %xmm2, %xmm1
+; SSE41-NEXT: pmovsxwd {{.*#+}} xmm2 = [4294934528,4294934528,4294934528,4294934528]
+; SSE41-NEXT: pmaxsd %xmm2, %xmm1
+; SSE41-NEXT: pmaxsd %xmm2, %xmm0
+; SSE41-NEXT: packssdw %xmm1, %xmm0
+; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_ssat_v8i32_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [32767,32767,32767,32767]
+; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
+; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_ssat_v8i32_v8i16:
; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528]
+; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
@@ -1795,33 +2009,47 @@ define <8 x i16> @trunc_ssat_v8i32_v8i16(<8 x i32> %a0) {
;
; AVX512F-LABEL: trunc_ssat_v8i32_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528]
+; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_ssat_v8i32_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovsdw %ymm0, %xmm0
+; AVX512VL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_ssat_v8i32_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528]
+; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_ssat_v8i32_v8i16:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovsdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_ssat_v8i32_v8i16:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovsdw %ymm0, %xmm0
+; SKX-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; SKX-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; SKX-NEXT: vpmovdw %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%1 = icmp slt <8 x i32> %a0,
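The i32-to-i16 tests clamp to 32767 and 4294934528, where 4294934528 = 2^32 - 32768 is just -32768 printed unsigned. The underlying IR is again a clamp-then-truncate, sketched here with hypothetical names:

define <4 x i16> @ssat_i32_sketch(<4 x i32> %a0) {
  %c1 = icmp slt <4 x i32> %a0, <i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <4 x i1> %c1, <4 x i32> %a0, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
  %c2 = icmp sgt <4 x i32> %min, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <4 x i1> %c2, <4 x i32> %min, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %r = trunc <4 x i32> %max to <4 x i16>
  ret <4 x i16> %r
}

On plain SSE2 there is no pminsd/pmaxsd (those arrive with SSE4.1), so the select lowers to the pcmpgtd/pand/pandn/por sequences seen in the SSE2-SSSE3 checks; previously the whole clamp disappeared into the saturating packssdw.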
SSE41-NEXT: pmaxsd %xmm4, %xmm3 +; SSE41-NEXT: pmaxsd %xmm4, %xmm0 +; SSE41-NEXT: packssdw %xmm3, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_ssat_v16i32_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX1-NEXT: vpackssdw 48(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [32767,32767,32767,32767] +; AVX1-NEXT: vpminsd 32(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpminsd 48(%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vpminsd (%rdi), %xmm0, %xmm3 +; AVX1-NEXT: vpminsd 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [4294934528,4294934528,4294934528,4294934528] +; AVX1-NEXT: vpmaxsd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmaxsd %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_ssat_v16i32_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX2-NEXT: vpminsd (%rdi), %ymm0, %ymm1 +; AVX2-NEXT: vpminsd 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] +; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpackssdw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_ssat_v16i32_v16i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpmovsdw %zmm0, %ymm0 +; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v16i32_v16i16: ; SKX: # %bb.0: -; SKX-NEXT: vmovdqa (%rdi), %ymm0 -; SKX-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0 +; SKX-NEXT: vpbroadcastd {{.*#+}} ymm0 = [32767,32767,32767,32767,32767,32767,32767,32767] +; SKX-NEXT: vpminsd (%rdi), %ymm0, %ymm1 +; SKX-NEXT: vpminsd 32(%rdi), %ymm0, %ymm0 +; SKX-NEXT: vpbroadcastd {{.*#+}} ymm2 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] +; SKX-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0 +; SKX-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1 +; SKX-NEXT: vpackssdw %ymm0, %ymm1, %ymm0 ; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; SKX-NEXT: retq %a0 = load <16 x i32>, ptr %p0 @@ -1991,30 +2302,48 @@ define <2 x i8> @trunc_ssat_v2i64_v2i8(<2 x i64> %a0) { ; AVX512F-LABEL: trunc_ssat_v2i64_v2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [127,127] +; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_ssat_v2i64_v2i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsqb %xmm0, %xmm0 +; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmaxsq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovqb %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_ssat_v2i64_v2i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [127,127] +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_ssat_v2i64_v2i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsqb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmovqb %xmm0, %xmm0 ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v2i64_v2i8: ; SKX: # %bb.0: -; SKX-NEXT: vpmovsqb %xmm0, %xmm0 +; SKX-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; SKX-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; SKX-NEXT: vpmovqb %xmm0, %xmm0 ; SKX-NEXT: retq %1 = icmp slt <2 x i64> %a0, <i64 127, i64 127> %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 127, i64 127> @@ -2138,7 +2467,13 @@ define void @trunc_ssat_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) { ; AVX512F-LABEL: trunc_ssat_v2i64_v2i8_store: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [127,127] +; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2151,7 +2486,13 @@ define void @trunc_ssat_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) { ; AVX512BW-LABEL: trunc_ssat_v2i64_v2i8_store: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [127,127] +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2325,7 +2666,9 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) { ; AVX512F-LABEL: trunc_ssat_v4i64_v4i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512F-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2338,7 +2681,9 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) { ; AVX512BW-LABEL: trunc_ssat_v4i64_v4i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovsqb %zmm0, %xmm0 +; 
AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -2515,7 +2860,9 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; AVX512F-LABEL: trunc_ssat_v4i64_v4i8_store: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512F-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2529,7 +2876,9 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; AVX512BW-LABEL: trunc_ssat_v4i64_v4i8_store: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3671,28 +4020,98 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 } define <4 x i8> @trunc_ssat_v4i32_v4i8(<4 x i32> %a0) { -; SSE-LABEL: trunc_ssat_v4i32_v4i8: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm0, %xmm0 -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: trunc_ssat_v4i32_v4i8: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: packssdw %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: retq ; -; AVX-LABEL: trunc_ssat_v4i32_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE41-LABEL: trunc_ssat_v4i32_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: packssdw %xmm0, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: retq ; -; AVX512-LABEL: trunc_ssat_v4i32_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX1-LABEL: trunc_ssat_v4i32_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_ssat_v4i32_v4i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127] +; AVX2-SLOW-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] +; AVX2-SLOW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; 
AVX2-SLOW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_ssat_v4i32_v4i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127] +; AVX2-FAST-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] +; AVX2-FAST-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_ssat_v4i32_v4i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127] +; AVX512F-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] +; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_ssat_v4i32_v4i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_ssat_v4i32_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127] +; AVX512BW-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] +; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_ssat_v4i32_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmovdb %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v4i32_v4i8: ; SKX: # %bb.0: -; SKX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; SKX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; SKX-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; SKX-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; SKX-NEXT: vpmovdb %xmm0, %xmm0 ; SKX-NEXT: retq %1 = icmp slt <4 x i32> %a0, <i32 127, i32 127, i32 127, i32 127> %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> <i32 127, i32 127, i32 127, i32 127> @@ -3703,22 +4122,70 @@ define <4 x i8> @trunc_ssat_v4i32_v4i8(<4 x i32> %a0) { } define void @trunc_ssat_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) { -; SSE-LABEL: trunc_ssat_v4i32_v4i8_store: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm0, %xmm0 -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: movd %xmm0, (%rdi) -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: trunc_ssat_v4i32_v4i8_store: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: por %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [4294967168,4294967168,4294967168,4294967168] +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pandn %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: por %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm1 +; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm1 +; SSE2-SSSE3-NEXT: movd %xmm1, (%rdi) +; SSE2-SSSE3-NEXT: retq ; -; AVX-LABEL: trunc_ssat_v4i32_v4i8_store: -; AVX: # 
%bb.0: -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, (%rdi) -; AVX-NEXT: retq +; SSE41-LABEL: trunc_ssat_v4i32_v4i8_store: +; SSE41: # %bb.0: +; SSE41-NEXT: pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: packssdw %xmm0, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: movd %xmm0, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v4i32_v4i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_ssat_v4i32_v4i8_store: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127] +; AVX2-SLOW-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] +; AVX2-SLOW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi) +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_ssat_v4i32_v4i8_store: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127] +; AVX2-FAST-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] +; AVX2-FAST-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovd %xmm0, (%rdi) +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v4i32_v4i8_store: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127] +; AVX512F-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] +; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, (%rdi) @@ -3731,6 +4198,10 @@ define void @trunc_ssat_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) { ; ; AVX512BW-LABEL: trunc_ssat_v4i32_v4i8_store: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127] +; AVX512BW-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] +; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, (%rdi) @@ -3755,22 +4226,66 @@ define void @trunc_ssat_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) { } define <8 x i8> @trunc_ssat_v8i32_v8i8(<8 x i32> %a0) { -; SSE-LABEL: trunc_ssat_v8i32_v8i8: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: trunc_ssat_v8i32_v8i8: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127] +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm1, %xmm3 +; 
SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_ssat_v8i32_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [127,127,127,127] +; SSE41-NEXT: pminsd %xmm2, %xmm0 +; SSE41-NEXT: pminsd %xmm2, %xmm1 +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [4294967168,4294967168,4294967168,4294967168] +; SSE41-NEXT: pmaxsd %xmm2, %xmm1 +; SSE41-NEXT: pmaxsd %xmm2, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_ssat_v8i32_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [127,127,127,127] +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_ssat_v8i32_v8i8: ; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 @@ -3779,9 +4294,11 @@ define <8 x i8> @trunc_ssat_v8i32_v8i8(<8 x i32> %a0) { ; ; AVX512F-LABEL: trunc_ssat_v8i32_v8i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3793,9 +4310,11 @@ define <8 x i8> @trunc_ssat_v8i32_v8i8(<8 x i32> %a0) { ; ; AVX512BW-LABEL: trunc_ssat_v8i32_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -3819,17 +4338,58 @@ define <8 x i8> 
@trunc_ssat_v8i32_v8i8(<8 x i32> %a0) { } define void @trunc_ssat_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) { -; SSE-LABEL: trunc_ssat_v8i32_v8i8_store: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: movq %xmm0, (%rdi) -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: trunc_ssat_v8i32_v8i8_store: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127] +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: por %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: por %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: movq %xmm0, (%rdi) +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_ssat_v8i32_v8i8_store: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [127,127,127,127] +; SSE41-NEXT: pminsd %xmm2, %xmm0 +; SSE41-NEXT: pminsd %xmm2, %xmm1 +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [4294967168,4294967168,4294967168,4294967168] +; SSE41-NEXT: pmaxsd %xmm2, %xmm1 +; SSE41-NEXT: pmaxsd %xmm2, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: movq %xmm0, (%rdi) +; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_ssat_v8i32_v8i8_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [127,127,127,127] +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rdi) ; AVX1-NEXT: vzeroupper @@ -3837,6 +4397,10 @@ define void @trunc_ssat_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) { ; ; AVX2-LABEL: trunc_ssat_v8i32_v8i8_store: ; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 @@ -3846,9 +4410,11 @@ define void @trunc_ssat_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) { ; ; AVX512F-LABEL: trunc_ssat_v8i32_v8i8_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpminsd 
%ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -3861,9 +4427,11 @@ define void @trunc_ssat_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) { ; ; AVX512BW-LABEL: trunc_ssat_v8i32_v8i8_store: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3889,28 +4457,105 @@ define void @trunc_ssat_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) { } define <16 x i8> @trunc_ssat_v16i32_v16i8(ptr %p0) "min-legal-vector-width"="256" { -; SSE-LABEL: trunc_ssat_v16i32_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 32(%rdi), %xmm1 -; SSE-NEXT: packssdw 48(%rdi), %xmm1 -; SSE-NEXT: packssdw 16(%rdi), %xmm0 -; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: trunc_ssat_v16i32_v16i8: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm3 +; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm4 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [127,127,127,127] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: por %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: por %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [4294967168,4294967168,4294967168,4294967168] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm4 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: por %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: packssdw %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: por %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: 
packsswb %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_ssat_v16i32_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [127,127,127,127] +; SSE41-NEXT: movdqa 32(%rdi), %xmm2 +; SSE41-NEXT: pminsd %xmm1, %xmm2 +; SSE41-NEXT: movdqa 48(%rdi), %xmm3 +; SSE41-NEXT: pminsd %xmm1, %xmm3 +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: pminsd %xmm1, %xmm0 +; SSE41-NEXT: pminsd 16(%rdi), %xmm1 +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [4294967168,4294967168,4294967168,4294967168] +; SSE41-NEXT: pmaxsd %xmm4, %xmm1 +; SSE41-NEXT: pmaxsd %xmm4, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: pmaxsd %xmm4, %xmm3 +; SSE41-NEXT: pmaxsd %xmm4, %xmm2 +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: packsswb %xmm2, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_ssat_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX1-NEXT: vpackssdw 48(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [127,127,127,127] +; AVX1-NEXT: vpminsd 32(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpminsd 48(%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vpminsd (%rdi), %xmm0, %xmm3 +; AVX1-NEXT: vpminsd 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [4294967168,4294967168,4294967168,4294967168] +; AVX1-NEXT: vpmaxsd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmaxsd %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_ssat_v16i32_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpminsd (%rdi), %ymm0, %ymm1 +; AVX2-NEXT: vpminsd 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpackssdw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] @@ -3920,7 +4565,9 @@ define <16 x i8> @trunc_ssat_v16i32_v16i8(ptr %p0) "min-legal-vector-width"="256 ; AVX512-LABEL: trunc_ssat_v16i32_v16i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpmovsdb %zmm0, %xmm0 +; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -3943,30 +4590,108 @@ define <16 x i8> @trunc_ssat_v16i32_v16i8(ptr %p0) "min-legal-vector-width"="256 } define void @trunc_ssat_v16i32_v16i8_store(ptr %p0, ptr %p1) "min-legal-vector-width"="256" { -; SSE-LABEL: trunc_ssat_v16i32_v16i8_store: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 32(%rdi), %xmm1 -; SSE-NEXT: packssdw 48(%rdi), %xmm1 -; SSE-NEXT: packssdw 16(%rdi), %xmm0 -; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsi) -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: trunc_ssat_v16i32_v16i8_store: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm3 +; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm2 +; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-SSSE3-NEXT: 
movdqa 48(%rdi), %xmm4 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [127,127,127,127] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: por %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm4 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: por %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967168,4294967168,4294967168,4294967168] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm5 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: por %xmm1, %xmm4 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: packssdw %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm3, (%rsi) +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_ssat_v16i32_v16i8_store: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [127,127,127,127] +; SSE41-NEXT: movdqa 32(%rdi), %xmm1 +; SSE41-NEXT: pminsd %xmm0, %xmm1 +; SSE41-NEXT: movdqa 48(%rdi), %xmm2 +; SSE41-NEXT: pminsd %xmm0, %xmm2 +; SSE41-NEXT: movdqa (%rdi), %xmm3 +; SSE41-NEXT: pminsd %xmm0, %xmm3 +; SSE41-NEXT: pminsd 16(%rdi), %xmm0 +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [4294967168,4294967168,4294967168,4294967168] +; SSE41-NEXT: pmaxsd %xmm4, %xmm0 +; SSE41-NEXT: pmaxsd %xmm4, %xmm3 +; SSE41-NEXT: packssdw %xmm0, %xmm3 +; SSE41-NEXT: pmaxsd %xmm4, %xmm2 +; SSE41-NEXT: pmaxsd %xmm4, %xmm1 +; SSE41-NEXT: packssdw %xmm2, %xmm1 +; SSE41-NEXT: packsswb %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm3, (%rsi) +; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_ssat_v16i32_v16i8_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX1-NEXT: vpackssdw 48(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [127,127,127,127] +; AVX1-NEXT: vpminsd 32(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpminsd 48(%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vpminsd (%rdi), %xmm0, %xmm3 +; AVX1-NEXT: vpminsd 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [4294967168,4294967168,4294967168,4294967168] +; AVX1-NEXT: vpmaxsd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmaxsd %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, 
%xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rsi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_ssat_v16i32_v16i8_store: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpminsd (%rdi), %ymm0, %ymm1 +; AVX2-NEXT: vpminsd 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpackssdw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] @@ -4004,21 +4729,29 @@ define <8 x i8> @trunc_ssat_v8i16_v8i8(<8 x i16> %a0) { ; SSE-LABEL: trunc_ssat_v8i16_v8i8: ; SSE: # %bb.0: +; SSE-NEXT: pminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: packsswb %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: trunc_ssat_v8i16_v8i8: ; AVX: # %bb.0: +; AVX-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: trunc_ssat_v8i16_v8i8: ; AVX512: # %bb.0: +; AVX512-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v8i16_v8i8: ; SKX: # %bb.0: +; SKX-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; SKX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; SKX-NEXT: retq %1 = icmp slt <8 x i16> %a0, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> @@ -4032,30 +4765,40 @@ define <8 x i8> @trunc_ssat_v8i16_v8i8(<8 x i16> %a0) { } define void @trunc_ssat_v8i16_v8i8_store(<8 x i16> %a0, ptr%p1) { ; SSE-LABEL: trunc_ssat_v8i16_v8i8_store: ; SSE: # %bb.0: +; SSE-NEXT: pminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: packsswb %xmm0, %xmm0 ; SSE-NEXT: movq %xmm0, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: trunc_ssat_v8i16_v8i8_store: ; AVX: # %bb.0: +; AVX-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, (%rdi) ; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v8i16_v8i8_store: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_ssat_v8i16_v8i8_store: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_ssat_v8i16_v8i8_store: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rdi) ; 
AVX512BW-NEXT: retq @@ -4079,20 +4822,45 @@ define void @trunc_ssat_v8i16_v8i8_store(<8 x i16> %a0, ptr%p1) { } define <16 x i8> @trunc_ssat_v16i16_v16i8(<16 x i16> %a0) { -; SSE-LABEL: trunc_ssat_v16i16_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: trunc_ssat_v16i16_v16i8: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127] +; SSE2-SSSE3-NEXT: pminsw %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pminsw %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65408,65408,65408,65408,65408,65408,65408,65408] +; SSE2-SSSE3-NEXT: pmaxsw %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pmaxsw %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_ssat_v16i16_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127] +; SSE41-NEXT: pminsw %xmm2, %xmm0 +; SSE41-NEXT: pminsw %xmm2, %xmm1 +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [65408,65408,65408,65408,65408,65408,65408,65408] +; SSE41-NEXT: pmaxsw %xmm2, %xmm1 +; SSE41-NEXT: pmaxsw %xmm2, %xmm0 +; SSE41-NEXT: packsswb %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_ssat_v16i16_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127] +; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [65408,65408,65408,65408,65408,65408,65408,65408] +; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_ssat_v16i16_v16i8: ; AVX2: # %bb.0: +; AVX2-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -4100,34 +4868,44 @@ define <16 x i8> @trunc_ssat_v16i16_v16i8(<16 x i16> %a0) { ; ; AVX512F-LABEL: trunc_ssat_v16i16_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_ssat_v16i16_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_ssat_v16i16_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: 
vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_ssat_v16i16_v16i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovswb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v16i16_v16i8: ; SKX: # %bb.0: -; SKX-NEXT: vpmovswb %ymm0, %xmm0 +; SKX-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vpmovwb %ymm0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp slt <16 x i16> %a0, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> @@ -4139,60 +4917,122 @@ define <16 x i8> @trunc_ssat_v16i16_v16i8(<16 x i16> %a0) { } define <32 x i8> @trunc_ssat_v32i16_v32i8(ptr %p0) "min-legal-vector-width"="256" { -; SSE-LABEL: trunc_ssat_v32i16_v32i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 32(%rdi), %xmm1 -; SSE-NEXT: packsswb 16(%rdi), %xmm0 -; SSE-NEXT: packsswb 48(%rdi), %xmm1 -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: trunc_ssat_v32i16_v32i8: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127] +; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm0 +; SSE2-SSSE3-NEXT: pminsw %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm3 +; SSE2-SSSE3-NEXT: pminsw %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-SSSE3-NEXT: pminsw %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pminsw 48(%rdi), %xmm2 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65408,65408,65408,65408,65408,65408,65408,65408] +; SSE2-SSSE3-NEXT: pmaxsw %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: pmaxsw %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: packsswb %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pmaxsw %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pmaxsw %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: packsswb %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_ssat_v32i16_v32i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127] +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: pminsw %xmm2, %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm3 +; SSE41-NEXT: pminsw %xmm2, %xmm3 +; SSE41-NEXT: movdqa 32(%rdi), %xmm1 +; SSE41-NEXT: pminsw %xmm2, %xmm1 +; SSE41-NEXT: pminsw 48(%rdi), %xmm2 +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [65408,65408,65408,65408,65408,65408,65408,65408] +; SSE41-NEXT: pmaxsw %xmm4, %xmm2 +; SSE41-NEXT: pmaxsw %xmm4, %xmm1 +; SSE41-NEXT: packsswb %xmm2, %xmm1 +; SSE41-NEXT: pmaxsw %xmm4, %xmm3 +; SSE41-NEXT: pmaxsw %xmm4, %xmm0 +; SSE41-NEXT: packsswb %xmm3, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_ssat_v32i16_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX1-NEXT: vpacksswb 48(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [127,127,127,127,127,127,127,127] +; AVX1-NEXT: vpminsw 32(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpminsw 48(%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vpminsw (%rdi), %xmm0, %xmm3 +; AVX1-NEXT: vpminsw 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [65408,65408,65408,65408,65408,65408,65408,65408] +; AVX1-NEXT: vpmaxsw 
%xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpacksswb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpmaxsw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmaxsw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_ssat_v32i16_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpacksswb 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm0 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpminsw (%rdi), %ymm0, %ymm1 +; AVX2-NEXT: vpminsw 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408] +; AVX2-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpacksswb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v32i16_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpacksswb 32(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm0 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpminsw (%rdi), %ymm0, %ymm1 +; AVX512F-NEXT: vpminsw 32(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408] +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpacksswb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_ssat_v32i16_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vpacksswb 32(%rdi), %ymm0, %ymm0 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm0 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpminsw (%rdi), %ymm0, %ymm1 +; AVX512VL-NEXT: vpminsw 32(%rdi), %ymm0, %ymm0 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408] +; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpacksswb %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_ssat_v32i16_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpmovswb %zmm0, %ymm0 +; AVX512BW-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_ssat_v32i16_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vpmovswb %zmm0, %ymm0 +; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v32i16_v32i8: ; SKX: # %bb.0: -; SKX-NEXT: vmovdqa (%rdi), %ymm0 -; SKX-NEXT: vpacksswb 32(%rdi), %ymm0, %ymm0 +; SKX-NEXT: vpbroadcastd {{.*#+}} ymm0 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; SKX-NEXT: vpminsw (%rdi), %ymm0, %ymm1 +; SKX-NEXT: vpminsw 32(%rdi), %ymm0, %ymm0 +; SKX-NEXT: vpbroadcastd {{.*#+}} ymm2 = 
[65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408] +; SKX-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 +; SKX-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 +; SKX-NEXT: vpacksswb %ymm0, %ymm1, %ymm0 ; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; SKX-NEXT: retq %a0 = load <32 x i16>, ptr %p0 @@ -4205,42 +5045,185 @@ define <32 x i8> @trunc_ssat_v32i16_v32i8(ptr %p0) "min-legal-vector-width"="256 } define <32 x i8> @trunc_ssat_v32i32_v32i8(ptr %p0) "min-legal-vector-width"="256" { -; SSE-LABEL: trunc_ssat_v32i32_v32i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: movdqa 64(%rdi), %xmm1 -; SSE-NEXT: movdqa 96(%rdi), %xmm3 -; SSE-NEXT: packssdw 48(%rdi), %xmm2 -; SSE-NEXT: packssdw 16(%rdi), %xmm0 -; SSE-NEXT: packsswb %xmm2, %xmm0 -; SSE-NEXT: packssdw 112(%rdi), %xmm3 -; SSE-NEXT: packssdw 80(%rdi), %xmm1 -; SSE-NEXT: packsswb %xmm3, %xmm1 -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: trunc_ssat_v32i32_v32i8: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: movdqa 80(%rdi), %xmm1 +; SSE2-SSSE3-NEXT: movdqa 64(%rdi), %xmm5 +; SSE2-SSSE3-NEXT: movdqa 112(%rdi), %xmm8 +; SSE2-SSSE3-NEXT: movdqa 96(%rdi), %xmm9 +; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm0 +; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm6 +; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm3 +; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm4 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [127,127,127,127] +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm7, %xmm2 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pandn %xmm7, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm4 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm7, %xmm4 +; SSE2-SSSE3-NEXT: por %xmm0, %xmm4 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm6 +; SSE2-SSSE3-NEXT: pandn %xmm7, %xmm0 +; SSE2-SSSE3-NEXT: por %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm9 +; SSE2-SSSE3-NEXT: pandn %xmm7, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm9, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm9 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm8 +; SSE2-SSSE3-NEXT: pandn %xmm7, %xmm9 +; SSE2-SSSE3-NEXT: por %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5 +; SSE2-SSSE3-NEXT: pandn %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm10 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm10 +; SSE2-SSSE3-NEXT: pand %xmm10, %xmm1 +; SSE2-SSSE3-NEXT: pandn %xmm7, %xmm10 +; SSE2-SSSE3-NEXT: por %xmm1, %xmm10 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [4294967168,4294967168,4294967168,4294967168] +; SSE2-SSSE3-NEXT: movdqa %xmm10, %xmm7 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm10 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: por %xmm10, %xmm7 +; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm8 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: por %xmm8, %xmm1 +; SSE2-SSSE3-NEXT: packssdw %xmm7, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm7 
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm9 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: por %xmm9, %xmm7 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: por %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: packssdw %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: packsswb %xmm8, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm0, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm4 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: por %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: packssdw %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: packssdw %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: packsswb %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_ssat_v32i32_v32i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm5 = [127,127,127,127] +; SSE41-NEXT: movdqa 32(%rdi), %xmm2 +; SSE41-NEXT: pminsd %xmm5, %xmm2 +; SSE41-NEXT: movdqa 48(%rdi), %xmm3 +; SSE41-NEXT: pminsd %xmm5, %xmm3 +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: pminsd %xmm5, %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm4 +; SSE41-NEXT: pminsd %xmm5, %xmm4 +; SSE41-NEXT: movdqa 96(%rdi), %xmm6 +; SSE41-NEXT: pminsd %xmm5, %xmm6 +; SSE41-NEXT: movdqa 112(%rdi), %xmm7 +; SSE41-NEXT: pminsd %xmm5, %xmm7 +; SSE41-NEXT: movdqa 64(%rdi), %xmm1 +; SSE41-NEXT: pminsd %xmm5, %xmm1 +; SSE41-NEXT: pminsd 80(%rdi), %xmm5 +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm8 = [4294967168,4294967168,4294967168,4294967168] +; SSE41-NEXT: pmaxsd %xmm8, %xmm5 +; SSE41-NEXT: pmaxsd %xmm8, %xmm1 +; SSE41-NEXT: packssdw %xmm5, %xmm1 +; SSE41-NEXT: pmaxsd %xmm8, %xmm7 +; SSE41-NEXT: pmaxsd %xmm8, %xmm6 +; SSE41-NEXT: packssdw %xmm7, %xmm6 +; SSE41-NEXT: packsswb %xmm6, %xmm1 +; SSE41-NEXT: pmaxsd %xmm8, %xmm4 +; SSE41-NEXT: pmaxsd %xmm8, %xmm0 +; SSE41-NEXT: packssdw %xmm4, %xmm0 +; SSE41-NEXT: pmaxsd %xmm8, %xmm3 +; SSE41-NEXT: pmaxsd %xmm8, %xmm2 +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: packsswb %xmm2, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_ssat_v32i32_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX1-NEXT: vpackssdw 112(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vpackssdw 80(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw 48(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw 16(%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [127,127,127,127] +; AVX1-NEXT: vpminsd 96(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpminsd 112(%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vpminsd 64(%rdi), %xmm0, %xmm3 +; AVX1-NEXT: vpminsd 80(%rdi), %xmm0, %xmm4 +; AVX1-NEXT: vpminsd 32(%rdi), %xmm0, %xmm5 +; AVX1-NEXT: vpminsd 48(%rdi), %xmm0, %xmm6 +; AVX1-NEXT: vpminsd (%rdi), %xmm0, %xmm7 +; 
AVX1-NEXT: vpminsd 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [4294967168,4294967168,4294967168,4294967168] +; AVX1-NEXT: vpmaxsd %xmm8, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm8, %xmm7, %xmm7 +; AVX1-NEXT: vpackssdw %xmm0, %xmm7, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm8, %xmm6, %xmm6 +; AVX1-NEXT: vpmaxsd %xmm8, %xmm5, %xmm5 +; AVX1-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpacksswb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm8, %xmm4, %xmm4 +; AVX1-NEXT: vpmaxsd %xmm8, %xmm3, %xmm3 +; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmaxsd %xmm8, %xmm2, %xmm2 +; AVX1-NEXT: vpmaxsd %xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_ssat_v32i32_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-NEXT: vpackssdw 96(%rdi), %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpminsd 64(%rdi), %ymm0, %ymm1 +; AVX2-NEXT: vpminsd 96(%rdi), %ymm0, %ymm2 +; AVX2-NEXT: vpminsd (%rdi), %ymm0, %ymm3 +; AVX2-NEXT: vpminsd 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX2-NEXT: vpmaxsd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxsd %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpackssdw %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpmaxsd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpmaxsd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpackssdw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] -; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] @@ -4248,23 +5231,30 @@ define <32 x i8> @trunc_ssat_v32i32_v32i8(ptr %p0) "min-legal-vector-width"="256 ; ; AVX512-LABEL: trunc_ssat_v32i32_v32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vpmovsdb %zmm0, %xmm0 -; AVX512-NEXT: vpmovsdb %zmm1, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm0 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512-NEXT: vpminsd (%rdi), %zmm0, %zmm1 +; AVX512-NEXT: vpminsd 64(%rdi), %zmm0, %zmm0 +; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX512-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxsd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v32i32_v32i8: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa (%rdi), %ymm0 -; SKX-NEXT: vmovdqa 64(%rdi), %ymm1 -; SKX-NEXT: vpackssdw 96(%rdi), %ymm1, %ymm1 -; SKX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] -; SKX-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0 -; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; SKX-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 -; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; SKX-NEXT: vmovdqa 32(%rdi), %ymm1 +; SKX-NEXT: vmovdqa 64(%rdi), %ymm2 +; SKX-NEXT: vmovdqa 96(%rdi), %ymm3 +; SKX-NEXT: vpmovsdb %ymm1, %xmm1 +; SKX-NEXT: vpmovsdb %ymm3, %xmm3 +; SKX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; SKX-NEXT: 
vpmovsdb %ymm0, %xmm0 +; SKX-NEXT: vpmovsdb %ymm2, %xmm2 +; SKX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; SKX-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; SKX-NEXT: retq %a0 = load <32 x i32>, ptr %p0 %1 = icmp slt <32 x i32> %a0, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127> diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index a5d83a86f295e..5ba2f628cf4aa 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -61,32 +61,37 @@ define <2 x i32> @trunc_usat_v2i64_v2i32(<2 x i64> %a0) { ; AVX512F-LABEL: trunc_usat_v2i64_v2i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovusqd %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0] +; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_usat_v2i64_v2i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusqd %xmm0, %xmm0 +; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_usat_v2i64_v2i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovusqd %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0] +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_usat_v2i64_v2i32: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusqd %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v2i64_v2i32: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusqd %xmm0, %xmm0 +; SKX-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: retq %1 = icmp ult <2 x i64> %a0, <i64 4294967295, i64 4294967295> %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 4294967295, i64 4294967295> @@ -139,7 +144,9 @@ define void @trunc_usat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; AVX512F-LABEL: trunc_usat_v2i64_v2i32_store: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0] +; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512F-NEXT: vmovq %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -152,7 +159,9 @@ define void @trunc_usat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; AVX512BW-LABEL: trunc_usat_v2i64_v2i32_store: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0] +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512BW-NEXT: vmovq %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -497,13 +506,15 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) { ; AVX512-LABEL: trunc_usat_v8i64_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpmovusqd %zmm0, %ymm0 +; 
AVX512-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_usat_v8i64_v8i32: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 -; SKX-NEXT: vpmovusqd %zmm0, %ymm0 +; SKX-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; SKX-NEXT: vpmovqd %zmm0, %ymm0 ; SKX-NEXT: retq %a0 = load <8 x i64>, ptr %p0 %1 = icmp ult <8 x i64> %a0, @@ -578,30 +589,39 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) { ; AVX512F-LABEL: trunc_usat_v2i64_v2i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] +; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_usat_v2i64_v2i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusqw %xmm0, %xmm0 +; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovqw %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_usat_v2i64_v2i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_usat_v2i64_v2i16: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusqw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmovqw %xmm0, %xmm0 ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v2i64_v2i16: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusqw %xmm0, %xmm0 +; SKX-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; SKX-NEXT: vpmovqw %xmm0, %xmm0 ; SKX-NEXT: retq %1 = icmp ult <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -676,7 +696,10 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) { ; AVX512F-LABEL: trunc_usat_v2i64_v2i16_store: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] +; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -689,7 +712,10 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) { ; AVX512BW-LABEL: trunc_usat_v2i64_v2i16_store: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -809,7 +835,8 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) { ; AVX512F-LABEL: trunc_usat_v4i64_v4i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512F-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 
; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -822,7 +849,8 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) { ; AVX512BW-LABEL: trunc_usat_v4i64_v4i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512BW-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -944,7 +972,8 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; AVX512F-LABEL: trunc_usat_v4i64_v4i16_store: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512F-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -958,7 +987,8 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; AVX512BW-LABEL: trunc_usat_v4i64_v4i16_store: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512BW-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1145,14 +1175,16 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) { ; AVX512-LABEL: trunc_usat_v8i64_v8i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_usat_v8i64_v8i16: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 -; SKX-NEXT: vpmovusqw %zmm0, %xmm0 +; SKX-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; SKX-NEXT: vpmovqw %zmm0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %a0 = load <8 x i64>, ptr %p0 @@ -1211,33 +1243,34 @@ define <4 x i16> @trunc_usat_v4i32_v4i16(<4 x i32> %a0) { ; ; AVX512F-LABEL: trunc_usat_v4i32_v4i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovusdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535] +; AVX512F-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_usat_v4i32_v4i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusdw %xmm0, %xmm0 +; AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_usat_v4i32_v4i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovusdw %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535] +; AVX512BW-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_usat_v4i32_v4i16: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusdw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: 
trunc_usat_v4i32_v4i16: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusdw %xmm0, %xmm0 +; SKX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; SKX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; SKX-NEXT: retq %1 = icmp ult <4 x i32> %a0, %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> @@ -1298,10 +1331,10 @@ define void @trunc_usat_v4i32_v4i16_store(<4 x i32> %a0, ptr%p1) { ; ; AVX512F-LABEL: trunc_usat_v4i32_v4i16_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535] +; AVX512F-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rdi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_usat_v4i32_v4i16_store: @@ -1311,10 +1344,10 @@ define void @trunc_usat_v4i32_v4i16_store(<4 x i32> %a0, ptr%p1) { ; ; AVX512BW-LABEL: trunc_usat_v4i32_v4i16_store: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535] +; AVX512BW-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rdi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_usat_v4i32_v4i16_store: @@ -1410,35 +1443,40 @@ define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) { ; ; AVX512F-LABEL: trunc_usat_v8i32_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_usat_v8i32_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_usat_v8i32_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i16: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v8i32_v8i16: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusdw %ymm0, %xmm0 +; SKX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; SKX-NEXT: vpmovdw %ymm0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp ult <8 x i32> %a0, @@ -1577,13 +1615,15 @@ define <16 x i16> @trunc_usat_v16i32_v16i16(ptr %p0) { ; AVX512-LABEL: trunc_usat_v16i32_v16i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdw %zmm0, 
%ymm0 ; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_usat_v16i32_v16i16: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 -; SKX-NEXT: vpmovusdw %zmm0, %ymm0 +; SKX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; SKX-NEXT: vpmovdw %zmm0, %ymm0 ; SKX-NEXT: retq %a0 = load <16 x i32>, ptr %p0 %1 = icmp ult <16 x i32> %a0, @@ -1656,30 +1696,41 @@ define <2 x i8> @trunc_usat_v2i64_v2i8(<2 x i64> %a0) { ; AVX512F-LABEL: trunc_usat_v2i64_v2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,255] +; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_usat_v2i64_v2i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusqb %xmm0, %xmm0 +; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovqb %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_usat_v2i64_v2i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,255] +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_usat_v2i64_v2i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusqb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmovqb %xmm0, %xmm0 ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v2i64_v2i8: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusqb %xmm0, %xmm0 +; SKX-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; SKX-NEXT: vpmovqb %xmm0, %xmm0 ; SKX-NEXT: retq %1 = icmp ult <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -1752,7 +1803,11 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) { ; AVX512F-LABEL: trunc_usat_v2i64_v2i8_store: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,255] +; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1765,7 +1820,11 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) { ; AVX512BW-LABEL: trunc_usat_v2i64_v2i8_store: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,255] +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1886,7 +1945,8 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) { ; AVX512F-LABEL: trunc_usat_v4i64_v4i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512F-NEXT: vpminuq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1899,7 +1959,8 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) { ; AVX512BW-LABEL: trunc_usat_v4i64_v4i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512BW-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -2023,7 +2084,8 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; AVX512F-LABEL: trunc_usat_v4i64_v4i8_store: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512F-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2037,7 +2099,8 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; AVX512BW-LABEL: trunc_usat_v4i64_v4i8_store: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512BW-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2784,31 +2847,36 @@ define <4 x i8> @trunc_usat_v4i32_v4i8(<4 x i32> %a0) { ; ; AVX512F-LABEL: trunc_usat_v4i32_v4i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovusdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] +; AVX512F-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_usat_v4i32_v4i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusdb %xmm0, %xmm0 +; AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_usat_v4i32_v4i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovusdb %zmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] +; AVX512BW-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_usat_v4i32_v4i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusdb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmovdb %xmm0, %xmm0 ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v4i32_v4i8: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusdb %xmm0, %xmm0 +; SKX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; SKX-NEXT: vpmovdb %xmm0, %xmm0 ; SKX-NEXT: retq %1 = icmp ult <4 x i32> %a0, %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> @@ -2866,10 +2934,11 @@ define void @trunc_usat_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) { ; ; AVX512F-LABEL: trunc_usat_v4i32_v4i8_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] +; AVX512F-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpackusdw 
%xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, (%rdi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_usat_v4i32_v4i8_store: @@ -2879,10 +2948,10 @@ define void @trunc_usat_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) { ; ; AVX512BW-LABEL: trunc_usat_v4i32_v4i8_store: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] +; AVX512BW-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovd %xmm0, (%rdi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_usat_v4i32_v4i8_store: @@ -2955,8 +3024,9 @@ define <8 x i8> @trunc_usat_v8i32_v8i8(<8 x i32> %a0) { ; ; AVX512F-LABEL: trunc_usat_v8i32_v8i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2968,8 +3038,9 @@ define <8 x i8> @trunc_usat_v8i32_v8i8(<8 x i32> %a0) { ; ; AVX512BW-LABEL: trunc_usat_v8i32_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -3048,8 +3119,9 @@ define void @trunc_usat_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) { ; ; AVX512F-LABEL: trunc_usat_v8i32_v8i8_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -3062,8 +3134,9 @@ define void @trunc_usat_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) { ; ; AVX512BW-LABEL: trunc_usat_v8i32_v8i8_store: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3169,14 +3242,16 @@ define <16 x i8> @trunc_usat_v16i32_v16i8(ptr %p0) { ; AVX512-LABEL: trunc_usat_v16i32_v16i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_usat_v16i32_v16i8: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 -; SKX-NEXT: vpmovusdb %zmm0, %xmm0 +; SKX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; SKX-NEXT: vpmovdb %zmm0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %a0 = load <16 x i32>, ptr %p0 @@ -3312,34 +3387,16 @@ define <8 x i8> @trunc_usat_v8i16_v8i8(<8 x i16> %a0) { ; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512F-LABEL: 
trunc_usat_v8i16_v8i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_usat_v8i16_v8i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_usat_v8i16_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_usat_v8i16_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovuswb %xmm0, %xmm0 -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc_usat_v8i16_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_usat_v8i16_v8i8: ; SKX: # %bb.0: -; SKX-NEXT: vpmovuswb %xmm0, %xmm0 +; SKX-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; SKX-NEXT: retq %1 = icmp ult <8 x i16> %a0, %2 = select <8 x i1> %1, <8 x i16> %a0, <8 x i16> @@ -3387,10 +3444,9 @@ define void @trunc_usat_v8i16_v8i8_store(<8 x i16> %a0, ptr%p1) { ; ; AVX512BW-LABEL: trunc_usat_v8i16_v8i8_store: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BW-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rdi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_usat_v8i16_v8i8_store: @@ -3466,21 +3522,23 @@ define <16 x i8> @trunc_usat_v16i16_v16i8(<16 x i16> %a0) { ; ; AVX512BW-LABEL: trunc_usat_v16i16_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BW-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_usat_v16i16_v16i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovuswb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v16i16_v16i8: ; SKX: # %bb.0: -; SKX-NEXT: vpmovuswb %ymm0, %xmm0 +; SKX-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vpmovwb %ymm0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp ult <16 x i16> %a0, @@ -3569,19 +3627,22 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(ptr %p0) { ; AVX512BW-LABEL: trunc_usat_v32i16_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BW-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_usat_v32i16_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v32i16_v32i8: ; SKX: # %bb.0: ; SKX-NEXT: 
vmovdqa64 (%rdi), %zmm0 -; SKX-NEXT: vpmovuswb %zmm0, %ymm0 +; SKX-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; SKX-NEXT: vpmovwb %zmm0, %ymm0 ; SKX-NEXT: retq %a0 = load <32 x i16>, ptr %p0 %1 = icmp ult <32 x i16> %a0, @@ -3729,19 +3790,21 @@ define <32 x i8> @trunc_usat_v32i32_v32i8(ptr %p0) { ; ; AVX512-LABEL: trunc_usat_v32i32_v32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vpmovusdb %zmm0, %xmm0 -; AVX512-NEXT: vpmovusdb %zmm1, %xmm1 +; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpminud 64(%rdi), %zmm0, %zmm1 +; AVX512-NEXT: vpminud (%rdi), %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_usat_v32i32_v32i8: ; SKX: # %bb.0: -; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 -; SKX-NEXT: vmovdqa64 64(%rdi), %zmm1 -; SKX-NEXT: vpmovusdb %zmm0, %xmm0 -; SKX-NEXT: vpmovusdb %zmm1, %xmm1 +; SKX-NEXT: vpbroadcastd {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; SKX-NEXT: vpminud 64(%rdi), %zmm0, %zmm1 +; SKX-NEXT: vpminud (%rdi), %zmm0, %zmm0 +; SKX-NEXT: vpmovdb %zmm0, %xmm0 +; SKX-NEXT: vpmovdb %zmm1, %xmm1 ; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; SKX-NEXT: retq %a0 = load <32 x i32>, ptr %p0