[AMDGPU] Add freeze for LowerSELECT #148796
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-amdgpu

Author: None (Shoreshen)

Changes

Trying to solve #147635. Add a freeze in the legalizer when breaking an i64 select into two i32 selects. Several tests changed; still need to investigate why.

Patch is 109.76 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/148796.diff

14 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6cf2055c8e565..15b0d547bee62 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11071,11 +11071,17 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
VT.getSizeInBits() == 512)
return splitTernaryVectorOp(Op, DAG);
- assert(VT.getSizeInBits() == 64);
SDLoc DL(Op);
SDValue Cond = Op.getOperand(0);
-
+ if (Cond.getOpcode() == ISD::SETCC) {
+ SDValue Freeze = DAG.getFreeze(Cond.getOperand(0));
+ if (Freeze != Cond.getOperand(0)) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ Cond =
+ DAG.getSetCC(DL, Cond.getValueType(), Freeze, Cond.getOperand(1), CC);
+ }
+ }
SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
SDValue One = DAG.getConstant(1, DL, MVT::i32);
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 77b78f1f8a333..43128db05a597 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -6,77 +6,77 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-LABEL: v_sdiv_v2i128_vv:
; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
-; SDAG-NEXT: v_mov_b32_e32 v18, 0
+; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 0, v0
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3
; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11
-; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
-; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc
+; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f
+; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v1, vcc
; SDAG-NEXT: v_mov_b32_e32 v26, v24
; SDAG-NEXT: v_mov_b32_e32 v27, v25
-; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v2, vcc
-; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, v1, v17, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, v0, v16, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, v19, s[4:5]
-; SDAG-NEXT: v_ffbh_u32_e32 v1, v20
-; SDAG-NEXT: v_ffbh_u32_e32 v2, v21
-; SDAG-NEXT: v_cndmask_b32_e64 v17, v3, v0, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v0, v20, v16
-; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0, v8
-; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v1
-; SDAG-NEXT: v_ffbh_u32_e32 v22, v16
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, 0, v2, vcc
+; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v3, vcc
+; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e32 v17, v3, v17, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v16, v2, v16, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v21, v1, v20, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v20, v0, v18, vcc
+; SDAG-NEXT: v_sub_i32_e32 v2, vcc, 0, v8
; SDAG-NEXT: v_or_b32_e32 v1, v21, v17
-; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc
-; SDAG-NEXT: v_min_u32_e32 v2, v19, v2
-; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v22
+; SDAG-NEXT: v_or_b32_e32 v0, v20, v16
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, 0, v9, vcc
+; SDAG-NEXT: v_ffbh_u32_e32 v18, v16
; SDAG-NEXT: v_ffbh_u32_e32 v22, v17
+; SDAG-NEXT: v_ffbh_u32_e32 v23, v20
+; SDAG-NEXT: v_ffbh_u32_e32 v28, v21
; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[6:7]
; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v10, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[6:7]
-; SDAG-NEXT: v_min_u32_e32 v1, v19, v22
-; SDAG-NEXT: v_add_i32_e64 v2, s[8:9], 64, v2
-; SDAG-NEXT: v_addc_u32_e64 v3, s[8:9], 0, 0, s[8:9]
-; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v11, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[6:7]
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v10, v2, v1, vcc
-; SDAG-NEXT: v_ffbh_u32_e32 v3, v29
-; SDAG-NEXT: v_ffbh_u32_e32 v19, v28
-; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v8, s[6:7]
-; SDAG-NEXT: v_or_b32_e32 v2, v29, v0
-; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v3
-; SDAG-NEXT: v_ffbh_u32_e32 v11, v0
+; SDAG-NEXT: v_add_i32_e64 v1, s[6:7], 32, v18
+; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v23
+; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v11, vcc
+; SDAG-NEXT: v_min_u32_e32 v22, v1, v22
+; SDAG-NEXT: v_min_u32_e32 v18, v18, v28
+; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v11, v23, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v28, v9, v3, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v29, v8, v2, vcc
+; SDAG-NEXT: v_add_i32_e32 v8, vcc, 64, v18
+; SDAG-NEXT: v_addc_u32_e64 v9, s[6:7], 0, 0, vcc
; SDAG-NEXT: v_or_b32_e32 v3, v28, v1
-; SDAG-NEXT: v_min_u32_e32 v8, v8, v19
-; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11
-; SDAG-NEXT: v_ffbh_u32_e32 v19, v1
+; SDAG-NEXT: v_or_b32_e32 v2, v29, v0
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v22, vcc
+; SDAG-NEXT: v_ffbh_u32_e32 v10, v0
+; SDAG-NEXT: v_ffbh_u32_e32 v11, v1
+; SDAG-NEXT: v_ffbh_u32_e32 v18, v29
+; SDAG-NEXT: v_ffbh_u32_e32 v22, v28
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; SDAG-NEXT: v_min_u32_e32 v2, v11, v19
-; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 64, v8
-; SDAG-NEXT: v_addc_u32_e64 v8, s[6:7], 0, 0, s[6:7]
-; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[6:7]
+; SDAG-NEXT: v_add_i32_e64 v2, s[6:7], 32, v10
+; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 32, v18
+; SDAG-NEXT: v_min_u32_e32 v2, v2, v11
+; SDAG-NEXT: v_min_u32_e32 v3, v3, v22
; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
-; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v8, v9, vcc
+; SDAG-NEXT: v_add_i32_e32 v3, vcc, 64, v3
+; SDAG-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v10, v9, vcc
; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v2
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v18, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v18, vcc
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v19, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v19, vcc
; SDAG-NEXT: v_or_b32_e32 v8, v8, v10
; SDAG-NEXT: v_or_b32_e32 v9, v3, v11
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v18, v19, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v19, v18, s[4:5]
; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
@@ -1564,67 +1564,67 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3
-; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
+; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f
; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc
; SDAG-NEXT: v_mov_b32_e32 v29, v28
; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc
-; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v17, v1, v17, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v0, v16, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v3, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v18, s[4:5]
-; SDAG-NEXT: v_ffbh_u32_e32 v18, v16
-; SDAG-NEXT: v_ffbh_u32_e32 v20, v17
+; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v3, vcc
; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0, v8
-; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v2, v16, v0
-; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18
-; SDAG-NEXT: v_ffbh_u32_e32 v22, v0
-; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc
-; SDAG-NEXT: v_or_b32_e32 v3, v17, v1
-; SDAG-NEXT: v_min_u32_e32 v18, v18, v20
-; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], 32, v22
-; SDAG-NEXT: v_ffbh_u32_e32 v22, v1
-; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v10, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v21, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3]
-; SDAG-NEXT: v_min_u32_e32 v3, v20, v22
-; SDAG-NEXT: v_add_i32_e64 v8, s[8:9], 64, v18
-; SDAG-NEXT: v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v20, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v9, vcc
+; SDAG-NEXT: v_or_b32_e32 v17, v1, v3
+; SDAG-NEXT: v_or_b32_e32 v16, v0, v2
+; SDAG-NEXT: v_ffbh_u32_e32 v20, v2
+; SDAG-NEXT: v_ffbh_u32_e32 v22, v3
+; SDAG-NEXT: v_ffbh_u32_e32 v23, v0
+; SDAG-NEXT: v_ffbh_u32_e32 v24, v1
+; SDAG-NEXT: v_subb_u32_e32 v25, vcc, 0, v10, vcc
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[16:17]
+; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20
+; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v23
; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v11, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v9, s[4:5]
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT: v_min_u32_e32 v16, v16, v22
+; SDAG-NEXT: v_min_u32_e32 v17, v17, v24
+; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_cndmask_b32_e32 v11, v11, v20, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v10, v10, v25, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v30, v9, v18, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v31, v8, v21, vcc
+; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17
+; SDAG-NEXT: v_addc_u32_e64 v18, s[6:7], 0, 0, vcc
+; SDAG-NEXT: v_or_b32_e32 v9, v30, v11
+; SDAG-NEXT: v_or_b32_e32 v8, v31, v10
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v10, v8, v3, vcc
-; SDAG-NEXT: v_ffbh_u32_e32 v9, v31
-; SDAG-NEXT: v_ffbh_u32_e32 v21, v30
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v20, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v8, v31, v2
-; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v9
-; SDAG-NEXT: v_ffbh_u32_e32 v20, v2
-; SDAG-NEXT: v_or_b32_e32 v9, v30, v3
-; SDAG-NEXT: v_min_u32_e32 v11, v11, v21
-; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v20
-; SDAG-NEXT: v_ffbh_u32_e32 v21, v3
+; SDAG-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc
+; SDAG-NEXT: v_ffbh_u32_e32 v17, v10
+; SDAG-NEXT: v_ffbh_u32_e32 v20, v11
+; SDAG-NEXT: v_ffbh_u32_e32 v21, v31
+; SDAG-NEXT: v_ffbh_u32_e32 v22, v30
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; SDAG-NEXT: v_min_u32_e32 v8, v20, v21
-; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v11
-; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[4:5]
-; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v8, v10
-; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc
-; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v10
+; SDAG-NEXT: v_add_i32_e64 v8, s[6:7], 32, v17
+; SDAG-NEXT: v_add_i32_e64 v9, s[6:7], 32, v21
+; SDAG-NEXT: v_min_u32_e32 v8, v8, v20
+; SDAG-NEXT: v_min_u32_e32 v9, v9, v22
+; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
+; SDAG-NEXT: v_add_i32_e32 v9, vcc, 64, v9
+; SDAG-NEXT: v_addc_u32_e64 v17, s[4:5], 0, 0, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
+; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v8, v16
+; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v17, v18, vcc
+; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v16
; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v19, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17]
; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v19, vcc
; SDAG-NEXT: v_or_b32_e32 v8, v8, v18
-; SDAG-NEXT: v_or_b32_e32 v9, v11, v19
+; SDAG-NEXT: v_or_b32_e32 v9, v17, v19
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
@@ -1633,72 +1633,72 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v34, v1, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v34, v3, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v27, v17, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v32, v2, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v27, v1, 0, s[4:5]
; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v33, v16, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v33, v0, 0, s[4:5]
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
; SDAG-NEXT: s_cbranch_execz .LBB2_6
; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
-; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v10
-; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v10
+; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v16
+; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v16
; SDAG-NEXT: v_mov_b32_e32 v8, 0
; SDAG-NEXT: v_mov_b32_e32 v9, 0
-; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc
-; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v20
+; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v17, vcc
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[0:1], v20
; SDAG-NEXT: v_addc_u32_e32 v34, vcc, 0, v18, vcc
; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc
-; SDAG-NEXT: v_or_b32_e32 v18, v32, v34
-; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v10
-; SDAG-NEXT: v_or_b32_e32 v19, v33, v35
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[0:1], v24
-; SDAG-NEXT: v_sub_i32_e32 v25, vcc, 64, v24
-; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v24
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
-; SDAG-NEXT: v_lshr_b64 v[18:19], v[16:17], v25
-; SDAG-NEXT: v_or_b32_e32 v11, v11, v19
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v18
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
-; SDAG-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24
-; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, v1, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v17, v32, v34
+; SDAG-NEXT: v_sub_i32_e32 v19, vcc, 0x7f, v16
+; SDAG-NEXT: v_or_b32_e32 v18, v33, v35
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v19
+; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 64, v19
+; SDAG-NEXT: v_lshl_b64 v[24:25], v[0:1], v19
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[17:18]
+; SDAG-NEXT: v_lshr_b64 v[16:17], v[0:1], v16
+; SDAG-NEXT: v_or_b32_e32 v17, v23, v17
+; SDAG-NEXT: v_or_b32_e32 v16, v22, v16
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v19
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v20, v16, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v25, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v24, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v18, 0
; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB2_5
; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
-; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v32
+; SDAG-NEXT: v_lshr_b64 v[8:9], v[0:1], v32
; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 64, v32
; SDAG-NEXT: v_subrev_i32_e32 v37, vcc, 64, v32
-; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v32
+; SDAG-NEXT: v_lshr_b64 v[24:25], v[2:3], v32
; SDAG-NEXT: v_add_i32_e32 v36, vcc, -1, v31
; SDAG-NEXT: v_mov_b32_e32 v18, 0
; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: v_mov_b32_e32 v22, 0
; SDAG-NEXT: v_mov_b32_e32 v23, 0
; SDAG-NEXT: s_mov_b64 s[10:11], 0
-; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v26
-; SDAG-NEXT: v_lshr_b64 v[48:49], v[0:1], v37
+; SDAG-NEXT: v_lshl_b64 v[26:27], v[2:3], v26
+; SDAG-NEXT: v_lshr_b64 v[48:49], v[2:3], v37
; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v30, vcc
; SDAG-NEXT: v_or_b32_e32 v9, v9, v27
; SDAG-NEXT: v_or_b32_e32 v8, v8, v26
-; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v2, vcc
+; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v10, vcc
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v32
; SDAG-NEXT: v_cndmask_b32_e64 v9, v49, v9, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v8, v48, v8, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v27, 0, v25, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v26, 0, v24, s[4:5]
-; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v3, vcc
+; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v11, vcc
; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
-; SDAG-NEXT: v_cndmask_b32_e32 v25, v9, v17, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v24, v8, v16, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v25, v9, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v24, v8, v0, vcc
; SDAG-NEXT: v_mov_b32_e32 v9, 0
; SDAG-NEXT: .LBB2_3: ; %udiv-do-while3
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1707,13 +1707,13 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v48, 31, v25
; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v11
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v17
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
; SDAG-NEXT: v_or_b32_e32 v21, v23, v21
; SDAG-NEXT: v_or_b32_e32 v20, v22, v20
; SDAG-NEXT: v_or_b32_e32 v22, v26, v48
; SDAG-NEXT: v_or_b32_e32 v23, v24, v49
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v8
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v8
; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v36, v23
; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v25, vcc
; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v38, v22, vcc
@@ -1721,8 +1721,8 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8
; SDAG-NEXT: v_and_b32_e32 v24, v8, v31
; SDAG-NEXT: v_and_b32_e32 v26, v8, v30
-; SDAG-NEXT: v_and_b32_e32 v48, v8, v2
-; SDAG-NEXT: v_and_b32_e32 v49, v8, v3
+; SDAG-NEXT: v_and_b32_e32 v48, v8, v10
+; SDAG-NEXT: v_and_b32_e32 v49, v8, v11
; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v23, v24
; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v26, vcc
@@ -1735,9 +1735,9 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v22, v32, v34
; SDAG-NEXT: v_or_b32_e32 v23, v33, v35
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23]
-; SDAG-NEXT: v_or_b32_e32 v11, v19, v11
+; SDAG-NEX...
[truncated]
Pull Request Overview
This PR adds explicit freezing for LowerSELECT by adjusting how 64-bit selects (especially those involving `undef`) are legalized into paired 32-bit selects, and updates the corresponding AMDGPU codegen tests to match the new instruction sequences.

- Introduce freezing behavior when lowering `select` with `undef` inputs, and add a dedicated test to verify it.
- Update dozens of AMDGPU FileCheck patterns for f64 rounding, comparisons, and select lowering to use the new `v_and_b32_e32`, `v_cmp_le_f64_e32`, and `s_and_b64` sequences.
- Adjust srem legalization tests (`srem64.ll`, `srem.ll`) to reflect reordered and merged 32-bit legalizer output.
Reviewed Changes
Copilot reviewed 14 out of 14 changed files in this pull request and generated no comments.
| File | Description |
|---|---|
| llvm/test/CodeGen/AMDGPU/srem64.ll | Reorder and merge 32-bit shift/add/xor sequences |
| llvm/test/CodeGen/AMDGPU/srem.ll | Fix backward select lowering in srem_i64 sequences |
| llvm/test/CodeGen/AMDGPU/select-undef.ll | Add a new test for freezing undef-based selects |
| llvm/test/CodeGen/AMDGPU/roundeven.ll | Update f64 rounding checks to use v_and/v_cmp_lt |
| llvm/test/CodeGen/AMDGPU/lround.ll | Switch to v_cmp_le_f64_e32 and v_and_b32_e32 |
| llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll | Adjust SI/CI patterns for rounding threshold logic |
| llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll | Change v_cmp_gt_f64_e64 to v_cmp_gt_f64_e32 |
| llvm/test/CodeGen/AMDGPU/llvm.frexp.ll | Insert v_and_b32_e32 + reorder frexp exp/mant calls |
| llvm/test/CodeGen/AMDGPU/fract-match.ll | Update fract/floor patterns to v_and + v_cmp_neq |
| llvm/test/CodeGen/AMDGPU/fnearbyint.ll | Add s_bitset0_b32 + switch cmp to e32 form |
| llvm/test/CodeGen/AMDGPU/fminimum3.ll | Rewrite fabs/min chains with v_and + e32 cmp |
| llvm/test/CodeGen/AMDGPU/fmaximum3.ll | Rewrite fabs/max chains with v_and + e32 cmp |
Comments suppressed due to low confidence (1)
llvm/test/CodeGen/AMDGPU/select-undef.ll:859
- This select uses an `undef` operand without any preceding `freeze`. To ensure legalization always preserves well-defined values, consider inserting an explicit `llvm.freeze` before the select, and add a FileCheck pattern verifying the freeze in the test.
%cond2 = select i1 %cond1, i32 %undef, i32 %val
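A minimal sketch of what that suggestion would look like, reusing the names from the quoted test line; the added freeze is hypothetical and is not part of this PR's test:

```llvm
; Hypothetical edit illustrating the suggestion above (not in select-undef.ll):
; freeze the undef-derived value so the select operand is well defined before
; legalization runs.
%undef.fr = freeze i32 %undef
%cond2 = select i1 %cond1, i32 %undef.fr, i32 %val
```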
SDLoc DL(Op);
SDValue Cond = Op.getOperand(0);

if (Cond.getOpcode() == ISD::SETCC) {
It doesn't matter what the source operation is, the freeze is potentially needed anyway (it's really the non-compare cases that are most interesting here)
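To illustrate that point, here is a hypothetical case (not one of the files in this PR) where the condition is not a compare at all, so the SETCC-only check above would insert no freeze even though the condition can still be poison when the 64-bit select is split into two 32-bit selects:

```llvm
; Hypothetical example: the i1 condition comes straight from an argument
; (it could just as well be a load or a trunc), so there is no SETCC whose
; operand can be frozen, yet the condition is still duplicated across the
; two i32 selects after legalization and could still carry poison.
define i64 @select_noncompare_cond(i1 %c, i64 %a, i64 %b) {
  %s = select i1 %c, i64 %a, i64 %b
  ret i64 %s
}
```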
✅ With the latest revision this PR passed the C/C++ code formatter.
✅ With the latest revision this PR passed the undef deprecator.
@@ -10099,6 +10099,15 @@ define <2 x i64> @srem_zero_zero() {
; GCN-LABEL: kernel:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_endpgm
redundant check?
I meant the GCN checks - the GFX6/9 auto-generated checks looked fine.
updated~~ Thanks~
fix shilei's comment
Co-authored-by: Shilei Tian <[email protected]>
; GFX6-LABEL: srem_zero_zero:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: srem_zero_zero:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
Why did these split when the output is identical?
There isn't currently a common GCN prefix for the llc RUNs - I think it got removed when the file was regenerated and we didn't have any uses of it.
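For reference, a sketch of what restoring a shared prefix could look like; the llc invocations below are placeholders and the actual RUN lines in srem.ll may differ:

```llvm
; Hypothetical RUN lines: passing a shared GCN prefix alongside the
; per-subtarget prefix lets update_llc_test_checks emit a single GCN check
; block wherever the GFX6 and GFX9 output is identical.
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
```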
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3]
; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: s_nop 1
Can you make sure all of these FP ops correctly report they can't introduce poison in a separate PR?
Trying to solve #147635.
Add a freeze in the legalizer when breaking an i64 select into two i32 selects.
Several tests changed; still need to investigate why.
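For context, a reduced sketch of the pattern this targets (hypothetical IR, not copied from the linked issue or the tests): when the 64-bit select is split into two 32-bit selects, the condition is consumed twice, and a poison-carrying condition could let the two halves resolve to different arms; freezing the compare's operand, as this patch does at the DAG level, pins it to a single value for both halves.

```llvm
; Hypothetical reduced example. %x may be poison; without the freeze, the
; icmp feeding both halves of the split i64 select is also poison, so each
; i32 half could independently pick a different arm and yield a mixed result.
define i64 @split_select_needs_freeze(i32 %x, i64 %a, i64 %b) {
  %x.fr = freeze i32 %x      ; corresponds to the DAG.getFreeze added in LowerSELECT
  %c = icmp eq i32 %x.fr, 0
  %s = select i1 %c, i64 %a, i64 %b
  ret i64 %s
}
```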